# NER Using 3 Models and Rules-based

In [1]:
# others libraries
import pandas as pd
from pprint import pprint
import re
import json

In [159]:
# NLTK and Stanford libraries
import nltk, re, os
import nltk.corpus
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer
from nltk.probability import FreqDist
from nltk.tag.stanford import StanfordNERTagger
from nltk import ne_chunk, pos_tag
from nltk.tree import Tree
from nltk import RegexpParser
from nltk.chunk.api import ChunkParserI

In [3]:
# spaCy libraries
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm

***

## Reading json file and storing in data frame

In [165]:
def read_load(path):
    # reading json file
    with open(path, 'r') as json_file:
        f = json.load(json_file)
    data = f
    
    # Collecting index of word, word, start time, and end time
    df = pd.DataFrame({'indx': ([X for X in range(len(data['values']['word']))]),
                       'word': data['values']['word'], 'start_time': data['values']['start'],
                       'end_time': data['values']['end']})
    
    df = df.set_index('indx')
    
    return data, df

In [658]:
data, df = read_load('D:/DSBA/Project/Final-Project-2/Nancy-Sandra.json')

***

## Stanford NER Tagger

It has 3 models

* 3 classes model for recognizing locations, person, and organizations
* 4 classes model for recognizing locations, person, organizations, and miscellaneous entities
* 7 classes model for recognizing locations, person, organizations, times, money, percents, and dates

In this project, we use 7 classes model

In [447]:
def Stanford_pred(dictt, df):
    
    java_path = ("C:/Program Files/Java/jdk-15.0.1/bin/java.exe")
    os.environ['JAVAHOME'] = java_path
    jar = ('D:/Program/stanford-ner-4.0.0/stanford-ner.jar')
    model = ('D:/Program/stanford-ner-4.0.0/classifiers/english.muc.7class.distsim.crf.ser') # 7 classes
    st = StanfordNERTagger(model, jar, encoding = 'utf-8')
    
    word_token = word_tokenize(dictt)
    classified_text = st.tag(word_token)

    wordlst = []
    ne_lst = []

    for i in range(len(classified_text)):
        if str(classified_text[i][1]) != 'O':
            if str(classified_text[i][1]) == 'PERSON' or str(classified_text[i][1]) == 'LOCATION' or str(classified_text[i][1]) == 'ORGANIZATION' or str(classified_text[i][1]) == 'MONEY' or str(classified_text[i][1]) == 'DATE':
                wordlst.append(str(classified_text[i][0]))
                ne_lst.append(str(classified_text[i][1]))
                
    st_pred = []        
    check = 0  

    for ww in df['word']:
        check = 0
        for w, n in zip(wordlst, ne_lst):
            if ww.__contains__(w):
                check = 1
                st_pred.append(str(n))
                break
        if check == 0:
            st_pred.append('O')
    
    df['stanford_pred'] = st_pred
    
    return st_pred, df

***

## NLTK

**NLTK recognizes the following entities:**
* ORGANIZATION - Georgia-Pacific Corp., WHO
* PERSON - Eddy Bonte, President Obama
* LOCATION - Murray River, Mount Everest
* DATE - June, 2008-06-29
* TIME - two fifty a m, 1:30 p.m.
* MONEY - 175 million Canadian Dollars, GBP 10.40
* PERCENT - twenty pct, 18.75 %
* FACILITY - Washington Monument, Stonehenge
* GPE - South East Asia, Midlothian

In [553]:
def NLTK_pred(dictt, df):
    
    word_token = word_tokenize(dictt)
    tagged_words = pos_tag(word_token)
    ne_tagged = ne_chunk(tagged_words, binary = False)

    lst_word = []
    lst_ne = []

    for chunk in ne_tagged:
        if hasattr(chunk, 'label'):
            if chunk.label() == 'PERSON' or chunk.label() == 'LOCATION' or chunk.label() == 'ORG' or chunk.label() == 'GPE' or chunk.label() == 'MONEY' or chunk.label() == 'DATE':
                if chunk.label() == 'ORG':
                    lst_word.append(chunk[0][0])
                    lst_ne.append('ORGANIZATION')
                if chunk.label() == 'LOC' or chunk.label() == 'GPE':
                    lst_word.append(chunk[0][0])
                    lst_ne.append('LOCATION')
                else:
                    lst_word.append(chunk[0][0])
                    lst_ne.append(chunk.label())
    
    nltk_pred = []        
    check = 0  

    for ww in df['word']:
        check = 0
        for w, n in zip(lst_word, lst_ne):
            if ww.__contains__(w):
                check = 1
                nltk_pred.append(str(n))
                break
        if check == 0:
            nltk_pred.append('O')
    
    df['nltk_pred'] = nltk_pred
    
    return nltk_pred, df

***

## spaCy

**spaCy recognizes the following entities:**
* PERSON - People, including fictional.
* NORP - Nationalities or religious or political groups.
* FAC - Buildings, airports, highways, bridges, etc.
* ORG - Companies, agencies, institutions, etc.
* GPE - Countries, cities, states.
* LOC - Non-GPE locations, mountain ranges, bodies of water.
* PRODUCT - Objects, vehicles, foods, etc. (Not services.)
* EVENT - Named hurricanes, battles, wars, sports events, etc.
* WORK_OF_ART - Titles of books, songs, etc.
* LAW - Named documents made into laws.
* LANGUAGE - Any named language.
* DATE - Absolute or relative dates or periods.
* TIME - Times smaller than a day.
* PERCENT - Percentage, including ”%“.
* MONEY - Monetary values, including unit.
* QUANTITY - Measurements, as of weight or distance.
* ORDINAL - “first”, “second”, etc.
* CARDINAL - Numerals that do not fall under another type.

In [554]:
def spaCy_pred(dictt, df):
    
    nlp = en_core_web_sm.load()
    # list of words that have named entities
    text = ([str(X) for X in nlp(dictt)
            if (X.ent_type_ != '' and X.ent_type_ != 'CARDINAL') & (str(X) != 'a') & (str(X) != 'good') & (str(X) != 'day') & (str(X) != '.') & (str(X) != ',')])
    # list of named entities
    ne = ([X.ent_type_ for X in nlp(dictt)
            if (X.ent_type_ != '' and X.ent_type_ != 'CARDINAL') & (str(X) != 'a') & (str(X) != 'good') & (str(X) != 'day') & (str(X) != '.') & (str(X) != ',')])
    
    sp_pred = []
    
    for n, i in enumerate(ne):
        if i == 'LOC':
            ne[n] = 'LOCATION'
        if i == 'GPE':
            ne[n] = 'LOCATION'
        if i == 'ORG':
            ne[n] = 'ORGANIZATION'
          
    check = 0  
    
    for ww in df['word']:
        check = 0
        for w, n in zip(text, ne):
            if ww.__contains__(w):
                check = 1
                sp_pred.append(str(n))
                break
        if check == 0:
            sp_pred.append('O')
                
    df['spacy_pred'] = sp_pred
                
    return sp_pred, df

***

## Combing Real Named Entities and Regular Expressions

In [651]:
def combined_models(df):
    
    # ------------ Selecting same predictions 2 of 3 models ------------
    
    i_twooth = []
    ne_twooth = []

    for i, st, nl, sp in zip(df.index, df['stanford_pred'], df['nltk_pred'], df['spacy_pred']):
        # check if stanford and nltk are same named entities
        if (st != 'O' and nl != 'O') and (str(st) == str(nl)):
            i_twooth.append(i)
            ne_twooth.append(str(st))
        # check if stanford and spacy are same named entities
        elif (st != 'O' and sp != 'O') and (str(st) == str(sp)):
            i_twooth.append(i)
            ne_twooth.append(str(st))
        # check if nltk and spacy are same named entities
        elif (nl != 'O' and sp != 'O') and (str(nl) == str(sp)):
            i_twooth.append(i)
            ne_twooth.append(str(nl))
        
    combined = []
    combined_check = 0
        
    for i in df.index:
        combined_check = 0
        for ii, n in zip(i_twooth, ne_twooth):
            if i == ii:
                combined_check = 1
                combined.append(str(n))
                break
        if combined_check == 0:
            combined.append('O')
       
    # ------------ Regular Expression checking ------------
    
    pii_index = []
    pii_type = []
    date_check = 0

    for i, num in zip(df.index, df['word']):
        date_check = 0
        for ii in i_twooth:
            if i == ii:
                date_check = 1
                break
        if date_check == 0:
            # ID card e.g. +666-666-666-6666
            if re.search('(\+?[0-9]{3,}-?[0-9]{3,}-?[0-9]{3,}-?[0-9]{4,})', num):
                pii_index.append(i)
                pii_type.append('IDCARD')
            # phone number e.g. 666-666-6666
            elif re.search('(\+?[0-9]{3,}-?[0-9]{3,}-?[0-9]{4,})', num):
                pii_index.append(i)
                pii_type.append('PHONENUM')
            # account number e.g. 666-666-666
            elif re.search('(\+?[0-9]{3,}-?[0-9]{3,}-?[0-9]{3,})', num):
                pii_index.append(i)
                pii_type.append('ACCNUM')
            # if not has punctuation
            elif re.search('[0-9]{9,}', num):
                pii_index.append(i)
                pii_type.append('PIINUM')
            
    regex_lst = []
    regex_check = 0
        
    for i in df.index:
        regex_check = 0
        for ii, pi in zip(pii_index, pii_type):
            if i == ii:
                regex_check = 1
                regex_lst.append(str(pi))
                break
        if regex_check == 0:
            regex_lst.append('O')

    # ------------ Combining real ents and regex ------------
            
    cb_rg = []

    for ent, regex in zip(combined, regex_lst):
        if ent != 'O' and regex == 'O':
            cb_rg.append(ent)
        elif regex != 'O' and ent == 'O':
            cb_rg.append(regex)
        else:
            cb_rg.append('O')
            
    df['real_ents'] = cb_rg
    
    return cb_rg, df

***

## Models Evaluation

In [401]:
ref_label = pd.read_csv('D:/DSBA/Project/Final-Project-2/data/Text files/ref-nancy-sandra.csv')
ref_label = [i for i in ref_label['label']]

In [660]:
st_pred, df = Stanford_pred(data['transcript'], df)
nltk_pred, df = NLTK_pred(data['transcript'], df)
sp_pred, df = spaCy_pred(data['transcript'], df)

**Accuracy of the 3 models**

In [479]:
from nltk.metrics.scores import accuracy
st_acc = accuracy(ref_label, st_pred)
nltk_acc = accuracy(ref_label, nltk_pred)
spacy_acc = accuracy(ref_label, sp_pred)

print('Stanford Accuracy: %.2f' % (st_acc * 100) + '%')
print('NLTK Accuracy: %.2f' % (nltk_acc * 100) + '%')
print('spaCy Accuracy: %.2f' % (spacy_acc * 100) + '%')

Stanford Accuracy: 90.58%
NLTK Accuracy: 85.34%
spaCy Accuracy: 91.62%


### Specify named entity accuracy evaluation

In [559]:
def only_ENT(ent):
    
    person = []
    org = []
    loc = []
    cd = []
    date = []
    money = []
    
    for p in ent:
        if p != 'PERSON':
            person.append('O')
        else:
            person.append(str(p))
            
    for o in ent:
        if o != 'ORGANIZATION':
            org.append('O')
        else:
            org.append(str(o))
            
    for l in ent:
        if l != 'LOCATION':
            loc.append('O')
        else:
            loc.append(str(l))
            
    for d in ent:
        if d != 'DATE':
            date.append('O')
        else:
            date.append(str(d))
            
    for m in ent:
        if m != 'MONEY':
            money.append('O')
        else:
            money.append(str(m))
            
    return person, org, loc, date, money

In [558]:
def acc_each_ENT(r_ps, r_org, r_loc, r_date, r_money, st_ps, st_org, st_loc, st_date, st_money, nltk_ps, nltk_org, nltk_loc, nltk_date, nltk_money, sp_ps, sp_org, sp_loc, sp_date, sp_money):
    
    print('-------------------------------------------\n')
    print('PERSON DETECT ACCURACY:')
    print('Stanford Accuracy: %.2f' % (accuracy(r_ps, st_ps) * 100) + '%')
    print('NLTK Accuracy: %.2f' % (accuracy(r_ps, nltk_ps) * 100) + '%')
    print('spaCy Accuracy: %.2f' % (accuracy(r_ps, sp_ps) * 100) + '%')
    print('\n-------------------------------------------\n')

    print('ORGANIZATION DETECT ACCURACY:')
    print('Stanford Accuracy: %.2f' % (accuracy(r_org, st_org) * 100) + '%')
    print('NLTK Accuracy: %.2f' % (accuracy(r_org, nltk_org) * 100) + '%')
    print('spaCy Accuracy: %.2f' % (accuracy(r_org, sp_org) * 100) + '%')
    print('\n-------------------------------------------\n')

    print('LOCATION DETECT ACCURACY:')
    print('Stanford Accuracy: %.2f' % (accuracy(r_loc, st_loc) * 100) + '%')
    print('NLTK Accuracy: %.2f' % (accuracy(r_loc, nltk_loc) * 100) + '%')
    print('spaCy Accuracy: %.2f' % (accuracy(r_loc, sp_loc) * 100) + '%')
    print('\n-------------------------------------------\n')

    print('DATE DETECT ACCURACY:')
    print('Stanford Accuracy: %.2f' % (accuracy(r_date, st_date) * 100) + '%')
    print('NLTK Accuracy: %.2f' % (accuracy(r_date, nltk_date) * 100) + '%')
    print('spaCy Accuracy: %.2f' % (accuracy(r_date, sp_date) * 100) + '%')
    print('\n-------------------------------------------\n')

    print('MONEY DETECT ACCURACY:')
    print('Stanford Accuracy: %.2f' % (accuracy(r_money, st_money) * 100) + '%')
    print('NLTK Accuracy: %.2f' % (accuracy(r_money, nltk_money) * 100) + '%')
    print('spaCy Accuracy: %.2f' % (accuracy(r_money, sp_money) * 100) + '%')
    print('\n-------------------------------------------')

### Accuracies of specific named entities

In [480]:
r_ps, r_org, r_loc, r_date, r_money = only_ENT(ref_label)
st_ps, st_org, st_loc, st_date, st_money = only_ENT(st_pred)
nltk_ps, nltk_org, nltk_loc, nltk_date, nltk_money = only_ENT(nltk_pred)
sp_ps, sp_org, sp_loc, sp_date, sp_money = only_ENT(sp_pred)

In [481]:
acc_each_ENT(r_ps, r_org, r_loc, r_date, r_money, st_ps, st_org, st_loc, st_date, st_money, nltk_ps, nltk_org, nltk_loc, nltk_date, nltk_money, sp_ps, sp_org, sp_loc, sp_date, sp_money)

-------------------------------------------

PERSON DETECT ACCURACY:
Stanford Accuracy: 98.95%
NLTK Accuracy: 93.72%
spaCy Accuracy: 98.95%

-------------------------------------------

ORGANIZATION DETECT ACCURACY:
Stanford Accuracy: 97.38%
NLTK Accuracy: 98.95%
spaCy Accuracy: 97.38%

-------------------------------------------

LOCATION DETECT ACCURACY:
Stanford Accuracy: 96.86%
NLTK Accuracy: 95.29%
spaCy Accuracy: 96.86%

-------------------------------------------

CARDINAL NUMBER DETECT ACCURACY:
Stanford Accuracy: 96.86%
NLTK Accuracy: 96.34%
spaCy Accuracy: 97.91%

-------------------------------------------

DATE DETECT ACCURACY:
Stanford Accuracy: 100.00%
NLTK Accuracy: 98.43%
spaCy Accuracy: 100.00%

-------------------------------------------

MONEY DETECT ACCURACY:
Stanford Accuracy: 100.00%
NLTK Accuracy: 100.00%
spaCy Accuracy: 100.00%

-------------------------------------------


***

In [661]:
cb_rg, df = combined_models(df)
df.head(10)

Unnamed: 0_level_0,word,start_time,end_time,stanford_pred,nltk_pred,spacy_pred,real_ents
indx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,"Hello,",0.1,0.7,O,LOCATION,O,O
1,you,0.7,1.4,O,O,O,O
2,have,1.4,1.6,O,O,O,O
3,called,1.6,2.0,O,O,O,O
4,virtual,2.0,2.3,O,O,O,O
5,bank.,2.3,2.6,O,O,O,O
6,This,2.6,3.3,O,O,O,O
7,is,3.3,3.5,O,O,O,O
8,Nancy,3.5,3.9,PERSON,PERSON,PERSON,PERSON
9,speaking.,3.9,4.4,O,O,O,O


In [664]:
formal_ents = df.drop(['stanford_pred', 'nltk_pred', 'spacy_pred'], axis = 1)

In [665]:
formal_ents = formal_ents[formal_ents['real_ents'] != 'O']

In [666]:
formal_ents

Unnamed: 0_level_0,word,start_time,end_time,real_ents
indx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8,Nancy,3.5,3.9,PERSON
23,ATM,9.2,9.7,ORGANIZATION
26,ATM,10.3,10.8,ORGANIZATION
62,Sandra,31.4,31.8,LOCATION
70,July,34.0,35.2,DATE
71,7th.,35.2,35.9,DATE
72,1974.,35.9,37.4,DATE
82,New,43.6,44.3,LOCATION
83,York.,44.3,44.3,LOCATION
94,+558-976-652-3663.,50.4,57.3,IDCARD
