In [1]:
import pandas as pd

import nltk, re, os
import nltk.corpus
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer
from nltk.probability import FreqDist
from nltk.tag.stanford import StanfordNERTagger
from nltk import ne_chunk, pos_tag
from nltk.tree import Tree
from nltk import RegexpParser
from nltk.chunk.api import ChunkParserI

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(palette = 'Set2', )
%matplotlib inline

from pprint import pprint

In [2]:
# Sample bank call-centre transcript (agent and caller); this is the input document passed to the tagging cells below.
text = ("Hello, you've called virtual bank. This is Marina speaking. How may I help you? Hello, Marina. I forgot my banking application password. What should I do? You have to press on forgot password button on the application screen and we'll send you a security number to your phone number. Oh, that's bad. I didn't use my old phone number anymore, but I haven't changed my details in the bank information. That's fine. Could I have your account number? That's one one one five four four two two two? Okay. And what's your identification number? Okay. My identification number is 1 1 0 2 5 6 9 8 5 4 2 9800 11-2 Okay, I have one one zero two five six nine eight. Five four two 9800 11-2. Yes. That's right. And what's your name ma'am? My name is Amanda Nelson. Okay. Mrs. Nelson. What's your phone number ma'am? My phone number is seven. Seven one nine eight five five five eight seven. I just replaced your phone number, and you can reset your password in the application by press on the forgot password, and we'll send you a security number to your phone. Thank you. Thank you for using our service Miss Nelson. Have a good day.")

## Information Extraction

In [3]:
def ie_preprocess(document):
    """Sentence-split, tokenise and POS-tag ``document``.

    Args:
        document: Raw text to process.

    Returns:
        A list with one entry per sentence; each entry is the
        ``nltk.pos_tag`` result for that sentence's tokens, i.e. a list
        of ``(token, tag)`` pairs.
    """
    return [
        nltk.pos_tag(nltk.word_tokenize(sentence))
        for sentence in nltk.sent_tokenize(document)
    ]

In [4]:
# Show the POS-tagged sentences produced for the sample transcript.
ie_preprocess(text)

[[('Hello', 'NNP'),
  (',', ','),
  ('you', 'PRP'),
  ("'ve", 'VBP'),
  ('called', 'VBN'),
  ('virtual', 'JJ'),
  ('bank', 'NN'),
  ('.', '.')],
 [('This', 'DT'),
  ('is', 'VBZ'),
  ('Marina', 'NNP'),
  ('speaking', 'NN'),
  ('.', '.')],
 [('How', 'WRB'),
  ('may', 'MD'),
  ('I', 'PRP'),
  ('help', 'VB'),
  ('you', 'PRP'),
  ('?', '.')],
 [('Hello', 'NNP'), (',', ','), ('Marina', 'NNP'), ('.', '.')],
 [('I', 'PRP'),
  ('forgot', 'VBD'),
  ('my', 'PRP$'),
  ('banking', 'NN'),
  ('application', 'NN'),
  ('password', 'NN'),
  ('.', '.')],
 [('What', 'WP'), ('should', 'MD'), ('I', 'PRP'), ('do', 'VB'), ('?', '.')],
 [('You', 'PRP'),
  ('have', 'VBP'),
  ('to', 'TO'),
  ('press', 'VB'),
  ('on', 'IN'),
  ('forgot', 'JJ'),
  ('password', 'JJ'),
  ('button', 'NN'),
  ('on', 'IN'),
  ('the', 'DT'),
  ('application', 'NN'),
  ('screen', 'NN'),
  ('and', 'CC'),
  ('we', 'PRP'),
  ("'ll", 'MD'),
  ('send', 'VB'),
  ('you', 'PRP'),
  ('a', 'DT'),
  ('security', 'NN'),
  ('number', 'NN'),
  ('to', 

## Detecting named entities

In [5]:
# Print each named-entity chunk NLTK finds, processing the transcript one sentence at a time.
for sent in nltk.sent_tokenize(text):
    tagged = nltk.pos_tag(nltk.word_tokenize(sent))
    for chunk in nltk.ne_chunk(tagged):
        # Only entity subtrees expose label(); plain (token, tag) items are skipped.
        if not hasattr(chunk, 'label'):
            continue
        entity_text = ' '.join(c[0] for c in chunk)
        print(chunk.label(), entity_text)

GPE Hello
GPE Marina
GPE Hello
GPE Marina
GPE Okay
GPE Okay
PERSON Amanda Nelson
GPE Okay
PERSON Nelson
PERSON Miss Nelson


### Real model: Stanford NER and NLTK taggers

In [6]:
# Stanford NER tagger
def stanford_tagger(
    document,
    java_path="C:/Program Files/Java/jdk-15.0.1/bin/java.exe",
    jar='D:/Program/stanford-ner-4.0.0/stanford-ner.jar',
    model='D:/Program/stanford-ner-4.0.0/classifiers/english.muc.7class.distsim.crf.ser',
):
    """Tag every token of ``document`` with the Stanford NER 7-class model.

    NOTE: a later cell in this notebook defines ``stanford_tagger`` again;
    once that cell has run, the later definition replaces this one.

    Args:
        document: Raw text to tokenise and tag.
        java_path: Location of the Java executable. It is written to the
            ``JAVAHOME`` environment variable on every call (a
            process-wide side effect).
        jar: Path to ``stanford-ner.jar``.
        model: Path to the serialized CRF classifier to load.

    Returns:
        The value returned by ``StanfordNERTagger.tag`` for the
        tokenised document.
    """
    # The defaults reproduce the previously hard-coded machine-specific
    # locations; pass explicit paths to run on another machine.
    os.environ['JAVAHOME'] = java_path

    st = StanfordNERTagger(model, jar, encoding='utf-8')
    return st.tag(word_tokenize(document))

# NLTK POS and NER taggers   
def nltk_tagger(document):
    """Run NLTK tokenisation, POS tagging and NE chunking on ``document``.

    Returns:
        Whatever ``ne_chunk`` produces for the POS-tagged tokens.
    """
    pos_tagged_tokens = pos_tag(word_tokenize(document))
    return ne_chunk(pos_tagged_tokens)

In [38]:
def stanford_tagger(
    document,
    java_path="C:/Program Files/Java/jdk-15.0.1/bin/java.exe",
    jar='D:/Program/stanford-ner-4.0.0/stanford-ner.jar',
    model='D:/Program/stanford-ner-4.0.0/classifiers/english.all.3class.distsim.crf.ser',
):
    """Tag ``document`` with the Stanford NER 3-class model and collect entities.

    Args:
        document: Raw text to tokenise and tag.
        java_path: Location of the Java executable. It is written to the
            ``JAVAHOME`` environment variable on every call (a
            process-wide side effect).
        jar: Path to ``stanford-ner.jar``.
        model: Path to the serialized CRF classifier to load.

    Returns:
        A ``(lst_ps, st_df)`` tuple:

        * ``lst_ps`` - list of tokens whose label is ``'PERSON'``, in
          document order.
        * ``st_df`` - DataFrame with columns ``word`` and ``ne`` holding
          every token whose label is not ``'O'``.
    """
    # The defaults reproduce the previously hard-coded machine-specific
    # locations; pass explicit paths to run on another machine.
    os.environ['JAVAHOME'] = java_path

    st = StanfordNERTagger(model, jar, encoding='utf-8')
    classified_text = st.tag(word_tokenize(document))

    # Walk over *all* tagged tokens. The previous index loop stopped at
    # len - 2 and therefore never looked at the final token.
    entities = [
        (str(token), str(label))
        for token, label in classified_text
        if str(label) != 'O'
    ]
    st_df = pd.DataFrame({
        'word': [token for token, _ in entities],
        'ne': [label for _, label in entities],
    })

    # Only PERSON rows belong in the person list. Previously the PERSON
    # filter was computed but ignored, so every entity token was returned.
    lst_ps = st_df.loc[st_df['ne'] == 'PERSON', 'word'].tolist()
    return lst_ps, st_df

# Tag the transcript with the Stanford model; ps_st is passed to person_check further down.
ps_st, st_df = stanford_tagger(text)

**NE Type and Examples**
* ORGANIZATION - Georgia-Pacific Corp., WHO
* PERSON - Eddy Bonte, President Obama
* LOCATION - Murray River, Mount Everest
* DATE - June, 2008-06-29
* TIME - two fifty a m, 1:30 p.m.
* MONEY - 175 million Canadian Dollars, GBP 10.40
* PERCENT - twenty pct, 18.75 %
* FACILITY - Washington Monument, Stonehenge
* GPE - South East Asia, Midlothian

In [44]:
def nltk_tagger(document):
    """Extract person tokens, other named entities and numbers with NLTK.

    The document is tokenised, POS-tagged and NE-chunked. Every token of a
    named-entity chunk is reported individually (one entry per token), so a
    multi-word entity such as a first name + surname is no longer reduced
    to its first word.

    Args:
        document: Raw text to process.

    Returns:
        A ``(lst_ps, nltk_df)`` tuple:

        * ``lst_ps`` - tokens that belong to ``PERSON`` chunks, in
          document order.
        * ``nltk_df`` - DataFrame with columns ``word`` and ``ne``. It
          holds one row per token of every non-PERSON entity chunk (``ne``
          is the chunk label) and one row per token outside any chunk
          whose POS tag is ``'CD'`` (``ne`` is ``'CD'``).
    """
    lst_word = []
    lst_ne = []
    lst_ps = []

    tagged_words = pos_tag(word_tokenize(document))
    # binary=False asks for typed entity labels rather than one generic label.
    ne_tagged = ne_chunk(tagged_words, binary=False)

    for chunk in ne_tagged:
        if hasattr(chunk, 'label'):
            # Entity subtree: its items are (token, pos) pairs.
            tokens = [token for token, _pos in chunk]
            if chunk.label() == 'PERSON':
                lst_ps.extend(tokens)
            else:
                lst_word.extend(tokens)
                lst_ne.extend([chunk.label()] * len(tokens))
        else:
            # Plain (token, pos) pair outside any entity: keep numbers only.
            token, pos = chunk
            if pos == 'CD':
                lst_word.append(token)
                lst_ne.append(pos)

    nltk_df = pd.DataFrame({'word': lst_word, 'ne': lst_ne})
    return lst_ps, nltk_df

# Tag the transcript with NLTK; both results are passed to person_check further down.
ps_nltk, nltk_df = nltk_tagger(text)

In [26]:
# nltk_df[nltk_df['ne'] == 'PERSON']

In [87]:
def person_check(stanford, nltk, nltk_df):
    """Keep the names both taggers agree on and add them to ``nltk_df``.

    Args:
        stanford: Iterable of name tokens from the Stanford tagger.
        nltk: Iterable of name tokens from the NLTK tagger. (Inside this
            function the parameter shadows the ``nltk`` module.)
        nltk_df: DataFrame with columns ``word`` and ``ne`` to extend.
            It is not modified.

    Returns:
        A ``(lst_rp, ne_st_nltk)`` tuple:

        * ``lst_rp`` - tokens of ``stanford`` that also occur in ``nltk``,
          in ``stanford`` order. A token equal to the one accepted
          immediately before it is skipped, so consecutive repeats
          collapse; non-consecutive repeats are kept.
        * ``ne_st_nltk`` - a new DataFrame: the rows of ``nltk_df``
          followed by one ``('<token>', 'PERSON')`` row per entry of
          ``lst_rp``. Original index labels are preserved.
    """
    # Set membership gives the same result as scanning the whole NLTK list
    # for every Stanford token, without the nested loop.
    nltk_names = set(nltk)

    lst_rp = []
    last_added = ''
    for candidate in stanford:
        if candidate in nltk_names and candidate != last_added:
            last_added = candidate
            lst_rp.append(candidate)

    if not lst_rp:
        # Nothing to add; still hand back a new object, never the input itself.
        return lst_rp, nltk_df.copy()

    person_df = pd.DataFrame({'word': lst_rp, 'ne': ['PERSON'] * len(lst_rp)})
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    ne_st_nltk = pd.concat([nltk_df, person_df], sort=False)
    return lst_rp, ne_st_nltk

# Cross-check the two taggers' token lists and build the combined entity table.
lst_rp, ne_st_nltk = person_check(ps_st, ps_nltk, nltk_df)

In [88]:
# Display the combined entity table returned by person_check.
ne_st_nltk

Unnamed: 0,word,ne
0,Hello,GPE
1,Marina,GPE
2,one,CD
3,one,CD
4,one,CD
5,five,CD
6,four,CD
7,four,CD
8,two,CD
9,two,CD
