# NER Using 3 Models and Rule-Based Matching

In [4]:
# other libraries
import pandas as pd
import re
import json

In [2]:
# NLTK and Stanford libraries
import nltk, re, os
import nltk.corpus
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer
from nltk.probability import FreqDist
from nltk.tag.stanford import StanfordNERTagger
from nltk import ne_chunk, pos_tag
from nltk.tree import Tree
from nltk import RegexpParser
from nltk.chunk.api import ChunkParserI

In [3]:
# spaCy libraries
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm

## Reading json file and storing in data frame

In [4]:
def read_load(path):
    """Load a transcript JSON file and tabulate its word timings.

    The file must contain a ``values`` object with parallel ``word``,
    ``start`` and ``end`` lists.

    Args:
        path: Location of the JSON file to read.

    Returns:
        A ``(data, df)`` tuple: ``data`` is the parsed JSON object and ``df``
        is a DataFrame indexed by word position (index name ``indx``) with
        the columns ``word``, ``start_time`` and ``end_time``.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)

    values = data['values']
    words = values['word']

    # One row per word: its position, the word itself, start and end time.
    df = pd.DataFrame({
        'indx': list(range(len(words))),
        'word': words,
        'start_time': values['start'],
        'end_time': values['end'],
    })
    df = df.set_index('indx')

    return data, df

***

# Named Entity Recognition Process

## Stanford NER Tagger

In [5]:
def Stanford_pred(
    dictt,
    df,
    java_path="C:/Program Files/Java/jdk-15.0.1/bin/java.exe",
    jar='D:/Program/stanford-ner-4.0.0/stanford-ner.jar',
    model='D:/Program/stanford-ner-4.0.0/classifiers/english.muc.7class.distsim.crf.ser',
):
    """Label every row of ``df['word']`` with the Stanford NER tagger.

    The text is tokenized and tagged; tokens tagged PERSON, LOCATION,
    ORGANIZATION, MONEY or DATE are kept as entities. Each row of ``df`` then
    receives the label of the first kept token that occurs as a substring of
    the row's word, or ``'O'`` when none does.

    Args:
        dictt: The text to tag, as a single string.
        df: DataFrame with a ``word`` column; a ``stanford_pred`` column is
            added to it in place.
        java_path: Java executable, exported through the ``JAVAHOME``
            environment variable.
        jar: Path to the Stanford NER jar.
        model: Path to the serialized CRF classifier (7-class model by
            default).

    Returns:
        A ``(st_pred, df)`` tuple: the list of per-row labels and the same
        DataFrame that was passed in.
    """
    os.environ['JAVAHOME'] = java_path
    st = StanfordNERTagger(model, jar, encoding='utf-8')

    kept_labels = ('PERSON', 'LOCATION', 'ORGANIZATION', 'MONEY', 'DATE')

    # (token, label) pairs for the entity classes we care about, in text order.
    entities = [
        (str(token), str(label))
        for token, label in st.tag(word_tokenize(dictt))
        if str(label) in kept_labels
    ]

    # First entity token contained in the row's word wins; otherwise 'O'.
    st_pred = [
        next((label for token, label in entities if token in ww), 'O')
        for ww in df['word']
    ]

    df['stanford_pred'] = st_pred

    return st_pred, df

## NLTK

In [6]:
def NLTK_pred(dictt, df):
    """Label every row of ``df['word']`` with NLTK's named-entity chunker.

    The text is tokenized, POS-tagged and chunked. Chunks labelled PERSON,
    LOCATION, GPE, ORGANIZATION, MONEY or DATE are kept, with the labels
    normalized to PERSON / LOCATION / ORGANIZATION / MONEY / DATE. Each row
    of ``df`` then receives the label of the first kept entity token that
    occurs as a substring of the row's word, or ``'O'`` when none does.

    Args:
        dictt: The text to tag, as a single string.
        df: DataFrame with a ``word`` column; an ``nltk_pred`` column is
            added to it in place.

    Returns:
        A ``(nltk_pred, df)`` tuple: the list of per-row labels and the same
        DataFrame that was passed in.
    """
    # Chunker label -> normalized label. The chunker reports organizations as
    # 'ORGANIZATION'; the short 'ORG' / 'LOC' spellings are accepted as
    # aliases so either form maps to the same normalized name.
    label_map = {
        'PERSON': 'PERSON',
        'LOCATION': 'LOCATION',
        'LOC': 'LOCATION',
        'GPE': 'LOCATION',
        'ORGANIZATION': 'ORGANIZATION',
        'ORG': 'ORGANIZATION',
        'MONEY': 'MONEY',
        'DATE': 'DATE',
    }

    tagged_words = pos_tag(word_tokenize(dictt))
    ne_tagged = ne_chunk(tagged_words, binary=False)

    entities = []
    for chunk in ne_tagged:
        # Only subtrees carry a label; plain (word, tag) leaves do not.
        if not hasattr(chunk, 'label'):
            continue
        normalized = label_map.get(chunk.label())
        if normalized is None:
            continue
        # NOTE: only the first token of a multi-token chunk is recorded.
        entities.append((chunk[0][0], normalized))

    # First entity token contained in the row's word wins; otherwise 'O'.
    nltk_pred = [
        next((str(label) for token, label in entities if token in ww), 'O')
        for ww in df['word']
    ]

    df['nltk_pred'] = nltk_pred

    return nltk_pred, df

## spaCy

In [9]:
def spaCy_pred(dictt, df):
    """Label every row of ``df['word']`` with spaCy's ``en_core_web_sm`` model.

    Tokens that carry an entity type other than CARDINAL or PRODUCT are kept,
    except for the literal tokens ``a``, ``good``, ``day``, ``.`` and ``,``.
    LOC and GPE are renamed LOCATION and ORG is renamed ORGANIZATION; other
    entity types are kept as reported. Each row of ``df`` then receives the
    label of the first kept token that occurs as a substring of the row's
    word, or ``'O'`` when none does.

    Args:
        dictt: The text to tag, as a single string.
        df: DataFrame with a ``word`` column; a ``spacy_pred`` column is
            added to it in place.

    Returns:
        A ``(sp_pred, df)`` tuple: the list of per-row labels and the same
        DataFrame that was passed in.
    """
    nlp = en_core_web_sm.load()
    # Run the pipeline once and derive both tokens and labels from it.
    doc = nlp(dictt)

    skipped_types = ('', 'CARDINAL', 'PRODUCT')
    skipped_tokens = ('a', 'good', 'day', '.', ',')
    label_map = {'LOC': 'LOCATION', 'GPE': 'LOCATION', 'ORG': 'ORGANIZATION'}

    # (token text, normalized label) pairs in document order.
    entities = [
        (str(tok), label_map.get(tok.ent_type_, tok.ent_type_))
        for tok in doc
        if tok.ent_type_ not in skipped_types and str(tok) not in skipped_tokens
    ]

    # First entity token contained in the row's word wins; otherwise 'O'.
    sp_pred = [
        next((str(label) for text, label in entities if text in ww), 'O')
        for ww in df['word']
    ]

    df['spacy_pred'] = sp_pred

    return sp_pred, df

## Combining Real Named Entities and Regular Expressions

In [7]:
def combined_models(df):
    """Merge the three model predictions and add regex-detected PII numbers.

    For every row the final label is chosen as follows:

    1. If Stanford or NLTK predicted ``'O'`` and spaCy predicted DATE or
       PERSON, spaCy's label is used.
    2. Otherwise, if two models agree on a non-``'O'`` label (checked in the
       order Stanford/NLTK, Stanford/spaCy, NLTK/spaCy), that label is used.
    3. Rows left without a label are tested against a set of digit patterns
       (ID card, phone, account and card numbers, or any run of nine or more
       digits); a match yields ``'PIINUM'``.
    4. Everything else is ``'O'``.

    Args:
        df: DataFrame with the columns ``word``, ``stanford_pred``,
            ``nltk_pred`` and ``spacy_pred``; a ``real_ents`` column is added
            to it in place.

    Returns:
        A ``(cb_rg, df)`` tuple: the list of final per-row labels and the
        same DataFrame that was passed in.
    """
    # ------------ Selecting same named entity predictions 2 of 3 models ------------

    spacy_trusted = ('DATE', 'PERSON')
    voted = {}  # index label -> agreed entity label (first occurrence wins)

    for idx, st, nl, sp in zip(df.index, df['stanford_pred'], df['nltk_pred'], df['spacy_pred']):
        if (st == 'O' or nl == 'O') and str(sp) in spacy_trusted:
            # spaCy alone is trusted for DATE and PERSON
            label = str(sp)
        elif st != 'O' and nl != 'O' and str(st) == str(nl):
            label = str(st)
        elif st != 'O' and sp != 'O' and str(st) == str(sp):
            label = str(sp)
        elif nl != 'O' and sp != 'O' and str(nl) == str(sp):
            label = str(sp)
        else:
            continue
        voted.setdefault(idx, label)

    combined = [voted.get(idx, 'O') for idx in df.index]

    # ------------ Regular Expression checking ------------

    # Raw strings keep the escaped '+' a regex escape, not a string escape.
    pii_patterns = [
        # ID card e.g. +666-666-666-6666
        re.compile(r'(\+?[0-9]{3,}-?[0-9]{3,}-?[0-9]{3,}-?[0-9]{4,})'),
        # phone number e.g. 666-666-6666
        re.compile(r'(\+?[0-9]{3,}-?[0-9]{3,}-?[0-9]{4,})'),
        # account number e.g. 666-666-666
        re.compile(r'(\+?[0-9]{3,}-?[0-9]{3,}-?[0-9]{3,})'),
        # card number
        re.compile(r'(\+?[0-9]{2,}-?[0-9]{3,}-?[0-9]{3,}-?[0-9]+-?[0-9]+)'),
        # digits without punctuation
        re.compile(r'\+?[0-9]{9,}'),
    ]

    pii_rows = set()
    for idx, word in zip(df.index, df['word']):
        # Rows that already carry an agreed entity are not re-examined.
        if idx in voted:
            continue
        if any(pattern.search(word) for pattern in pii_patterns):
            pii_rows.add(idx)

    regex_lst = ['PIINUM' if idx in pii_rows else 'O' for idx in df.index]

    # ------------ Combining real ents and regex ------------

    cb_rg = []
    for ent, regex in zip(combined, regex_lst):
        if ent != 'O' and regex == 'O':
            cb_rg.append(ent)
        elif regex != 'O' and ent == 'O':
            cb_rg.append(regex)
        else:
            cb_rg.append('O')

    df['real_ents'] = cb_rg

    return cb_rg, df

## Creating New Data Frame to Store Real Entities

In [8]:
def filter_ents(df):
    """Return only the rows that carry a final entity label.

    The per-model prediction columns (``stanford_pred``, ``nltk_pred``,
    ``spacy_pred``) are removed and rows whose ``real_ents`` value is ``'O'``
    are discarded. The input DataFrame is left untouched.
    """
    model_columns = ['stanford_pred', 'nltk_pred', 'spacy_pred']
    has_entity = df['real_ents'] != 'O'

    return df.loc[has_entity].drop(columns=model_columns)

***