Description: 
First we train a Bi-LSTM character level classifier which is used to classify the entire input string into one of 5 categories. After classification, we try to group the entity with the already present entities according to simialrity criterias. If the entity is not similar enough to the already present entities it will be recorded as a new entity and assigned a new group id.

In [1]:
!pip install tensorflow
!pip install pywikibot
!pip install jellyfish
!pip install rapidfuzz
!pip install autocorrect
!pip install spacy==2.2.4




In [2]:
!python -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9 MB)
[K     |████████████████████████████████| 827.9 MB 15.0 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [3]:
!pip install tensorflow




In [36]:
import pywikibot
from rapidfuzz import fuzz 

import en_core_web_lg
import spacy
from spacy import displacy
from collections import Counter

from scipy import spatial
from autocorrect import Speller

import re

from googlesearch import search


In [5]:
import string
from random import randrange
import random  # used to create random separator
import string  # this is used to create the vocab of all the characters , stored in the variable 'vocab'
import pandas as pd
import numpy as np
from datetime import datetime
import os
import csv
import pickle

# below packages are used to create the model
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Bidirectional, TimeDistributed, Input, Masking
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

In [6]:
# max length of the input
maxlen=100

In [10]:
# load the csv data file
df=pd.read_csv('datafile_lstm.csv')

In [11]:
# shuffle data frame
df = df.sample(frac=1).reset_index(drop=True)

In [12]:
# split the dataframe into train and test
df1=df.iloc[100:500,:]
df2=df.iloc[:100,:]

In [13]:
# Dictionary contains the Labels
labels_dict = {  # we are keeping the value part as strings as it easy to generate labels , we could just give len(string)*label value to generate the label
    'company_name': 0,
    'company_address': 1,
    'serial_number': 2,
    'physical_good': 3,
    'location': 4
}

In [14]:
# create reverse dictionary of label placeholders
reversed_labels = {value: key for (key, value) in labels_dict.items()}
reversed_labels

{0: 'company_name',
 1: 'company_address',
 2: 'serial_number',
 3: 'physical_good',
 4: 'location'}

In [15]:
# define the vocab
vocab = list(string.whitespace + string.digits +
             string.ascii_lowercase + string.punctuation)  # total 74 characters

In [16]:
vocab_size=len(vocab)
vocab_size

74

In [17]:
# create the Tokenizer object, word_index contaning all the chars in the vocab
# basically this will encode all the characters into numbers, so each character defined in our vocab will be represented by a number
# vocab is just a list of tokens
tokenizer = Tokenizer(num_words=len(vocab), filters='',
                      char_level=True, oov_token='<oov>')
tokenizer.fit_on_texts(vocab)

In [18]:
#train
train_sentences=list(df1['text'])
train_labels=list(df1['category'])
train_sequences=tokenizer.texts_to_sequences(train_sentences)
train_padded=pad_sequences(train_sequences, padding='post',maxlen=maxlen, truncating='pre') #maxlen decides the maximum length of the sequence
train_labels=[labels_dict[item] for item in train_labels ]
train_labels = np.asarray(train_labels, dtype=np.int32)
train_label_one_hot = tf.keras.utils.to_categorical(train_labels, dtype='int32')


In [19]:
#test
test_sentences=list(df2['text'])
test_labels=list(df2['category'])
test_sequences=tokenizer.texts_to_sequences(test_sentences)
test_padded=pad_sequences(test_sequences, padding='post',maxlen=maxlen, truncating='pre') #maxlen decides the maximum length of the sequence
test_labels=[labels_dict[item] for item in test_labels ]
test_labels = np.asarray(test_labels, dtype=np.int32)
test_label_one_hot = tf.keras.utils.to_categorical(test_labels, dtype='int32')


In [20]:
# Create model
model = Sequential()
model.add(Embedding(input_dim=vocab_size+1, output_dim=32,
                    input_length=maxlen, mask_zero=True))
# vocab_size+1 is to inlcude the padding
# output_dim is the size of the embedding for the characters in the vocab
model.add(Bidirectional(LSTM(32, return_sequences=False)))
#model.add(Bidirectional(LSTM(32, return_sequences=True)))
# 5 is the number of labels and which we need in the softamx classifier
model.add(Dense(len(labels_dict), activation='softmax'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 32)           2400      
_________________________________________________________________
bidirectional (Bidirectional (None, 64)                16640     
_________________________________________________________________
dense (Dense)                (None, 5)                 325       
Total params: 19,365
Trainable params: 19,365
Non-trainable params: 0
_________________________________________________________________


In [21]:
# create checkpoint
checkpoint_filepath = 'model_checkpoint'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False,
    save_best_only=False,
    monitor='val_accuracy',
    save_freq='epoch',
    mode='max',
    verbose=0,
    options=None
)
adam = keras.optimizers.Adam(learning_rate=0.001015)
model.compile(loss='categorical_crossentropy',
              optimizer=adam, metrics=['accuracy'])

history = model.fit(train_padded, train_label_one_hot,batch_size=32, epochs=10,  # TRAIN THE MODEL
                    validation_data=(test_padded, test_label_one_hot)#, callbacks=[model_checkpoint_callback]
                    )
model.save('model/intelligent-address-parser-model.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Entity Disambiguation

In [22]:
# load the english spacy model
# this is used to get vectors for text
nlp = en_core_web_lg.load()

In [23]:
# setting up the tables where data will be stored
company_address_entity_table = pd.DataFrame({'entity':'','group_id':int()},index=[])

In [24]:
# sample serial numbers inital values
serial_number_entity_table = pd.DataFrame({'entity':'','group_id':int()},index=[])

serial_number_entity_table=pd.DataFrame({'entity':['abc12345','bhkdj9849204','zlljdoi9720483','bbnvnv909090'],
                                   'group_id':[0,1,2,3]})
serial_number_entity_table

Unnamed: 0,entity,group_id
0,abc12345,0
1,bhkdj9849204,1
2,zlljdoi9720483,2
3,bbnvnv909090,3


In [25]:
# sample physical goods initial values
physical_good_entity_table = pd.DataFrame({'entity':'','group_id':int()},index=[])

physical_good_entity_table=physical_good_entity_table.append({'entity':'plastic bottle','group_id':0,'vector':nlp('plastic bottle').vector},ignore_index=True)
physical_good_entity_table=physical_good_entity_table.append({'entity':'steel bowl','group_id':1,'vector':nlp('steel bowl').vector},ignore_index=True)
physical_good_entity_table=physical_good_entity_table.append({'entity':'leather sofa','group_id':2,'vector':nlp('leather sofa').vector},ignore_index=True)
physical_good_entity_table=physical_good_entity_table.append({'entity':'hardwood table','group_id':3,'vector':nlp('hardwood table').vector},ignore_index=True)

physical_good_entity_table

Unnamed: 0,entity,group_id,vector
0,plastic bottle,0,"[-0.289565, -0.049595, -0.077429, -0.15209301,..."
1,steel bowl,1,"[0.12778, 0.25831202, 0.45532, -0.372285, -0.1..."
2,leather sofa,2,"[-0.06037, -0.40972, -0.208975, -0.005081499, ..."
3,hardwood table,3,"[-0.091709, 0.09808999, -0.29916, -0.36364597,..."


In [26]:
# sample location initial value
location_entity_table = pd.DataFrame({'entity':'','group_id':int()},index=[])

location_entity_table=location_entity_table.append({'entity':'London','group_id':0,'vector':nlp('London, UK').vector},ignore_index=True)
location_entity_table=location_entity_table.append({'entity':'Rome, Italy','group_id':1,'vector':nlp('Rome, Italy').vector},ignore_index=True)
location_entity_table=location_entity_table.append({'entity':'Tokyo','group_id':2,'vector':nlp('Tokyo').vector},ignore_index=True)
location_entity_table=location_entity_table.append({'entity':'Japan','group_id':3,'vector':nlp('Japan').vector},ignore_index=True)

location_entity_table

Unnamed: 0,entity,group_id,vector
0,London,0,"[-0.040734004, 0.24897666, 0.082936674, -0.140..."
1,"Rome, Italy",1,"[0.041836005, 0.30547366, -0.11563143, -0.2124..."
2,Tokyo,2,"[0.28876, -0.55541, 0.083178, -0.19359, 0.3757..."
3,Japan,3,"[-0.44528, -0.17553, 0.075346, 0.0048481, 0.23..."


# serial_grouper

In [27]:
#decide to group or to add as new entity
#from rapidfuzz import fuzz

def serial_grouper(input_string):
    global serial_number_entity_table
    #input_string=sentence
    grouped_flag=0
    for index, row in serial_number_entity_table.iterrows():

        if re.findall("\A"+input_string[0:3], row['entity']): # check if the starting 3 characters are same, if yes then group them
            # save the entity in table and assign the same group id
            serial_number_entity_table=serial_number_entity_table.append({'entity':input_string, 'group_id':row['group_id']},ignore_index=True)
            print('entity grouped: ',{'entity':input_string, 'group_id':row['group_id']})
            grouped_flag=1
            break
        

    if grouped_flag==0:
        # save it as a new entity
        last=max(list(serial_number_entity_table['group_id']))
        serial_number_entity_table=serial_number_entity_table.append({'entity':input_string,'group_id':last+1},ignore_index=True)
        print('entity saved as new entry in the table: ', {'entity':input_string,'group_id':last+1})  
        
    return

#goods_grouper


In [28]:
# importing spelling corrector, we can easily use spelling correction as physical goods are very common and new names dont come in this
spell = Speller(lang='en')

In [29]:
def goods_grouper(input_entity):
    global physical_good_entity_table
    #input_entity='chair'
    input_entity=spell(input_entity) # spelling correction # physical goods have very general name and so we can easily apply spelling correction

    input_vector=nlp(input_entity).vector
    list_of_similarities=[]
    for index, row in physical_good_entity_table.iterrows():
        result = 1 - spatial.distance.cosine(row['vector'], input_vector )
        print(result)
        list_of_similarities.append(result)

    max_sim=max(list_of_similarities)
    print('max sim:',max_sim)
    if max_sim>0.72:
        group_id=list_of_similarities.index(max_sim)
        physical_good_entity_table=physical_good_entity_table.append({'entity':input_entity,'group_id':group_id,'vector':input_vector},ignore_index=True)
        print('entity grouped: ',{'entity':input_entity,'group_id':group_id})

    else:
        last=max(list(physical_good_entity_table['group_id']))
        physical_good_entity_table=physical_good_entity_table.append({'entity':input_entity,'group_id':last+1,'vector':input_vector},ignore_index=True)
        print('entity added: ',{'entity':input_entity,'group_id':last+1})


# location_grouper

vectors are helpful in reducing the search space, or you can say it helps to get candidates then we do string comparisions.

In [30]:
#spell = Speller(lang='en') # spelling correction was not working properly over here

def location_grouper(input_entity):
    global location_entity_table
    #input_entity='London, UK'
    #input_entity=spell(input_entity) # spelling correction # physical goods have very general name and so we can easily apply spelling correction

    input_vector=nlp(input_entity).vector
    list_of_similarities=[]
    candidates=[]
    for index, row in location_entity_table.iterrows():
        result = 1 - spatial.distance.cosine(row['vector'], input_vector )
        print(result)
        if result>0.70:
            candidates.append(index)
        #list_of_similarities.append(result)

    print('candidates:',candidates)

    if candidates:
        # prep input_entity for comparision by splitting it at spaces and
        input_entity_modified = input_entity.split(',')
        print('input_entity_modified',input_entity_modified)
        in_vec_list=[] # list of individual vectors for the entity parts
        for item in input_entity_modified:
            in_vec_list.append(nlp(item.strip()).vector)
            

        # for each candidate , replace comma with space and split at space, and then compare the vectors

        candidate_score_list=[]
        for item in candidates :
            can_string=location_entity_table['entity'][item] # replace comma with space
            can_string=can_string.split(',')

            print('can_string: ',can_string)
            can_vec_list=[] # list of individual vectors for the entity parts
            for i in can_string:
                can_vec_list.append(nlp(i.strip()).vector)

            # now we compare the elements from the 2 vector lists
            print('in_vec_list length',len(in_vec_list))
            print('can_vec_list length',len(can_vec_list))

            if len(in_vec_list)==1 and len(can_vec_list)==1:
                candidate_score_list.append(1 - spatial.distance.cosine(can_vec_list[0], in_vec_list[0]))

            elif len(in_vec_list)==1 and len(can_vec_list)==2:# first part is important
                candidate_score_list.append(1 - spatial.distance.cosine(can_vec_list[0], in_vec_list[0]))

            elif len(in_vec_list)==2 and len(can_vec_list)==1: # first part is important
                print('executing : in2 and can1')
                candidate_score_list.append(1 - spatial.distance.cosine(can_vec_list[0], in_vec_list[0]))

            elif len(in_vec_list)==2 and len(can_vec_list)==2: # least score wil determine 
                candidate_score_list.append(min([1 - spatial.distance.cosine(can_vec_list[0], in_vec_list[0]), 1 - spatial.distance.cosine(can_vec_list[1], in_vec_list[1])]))

            else:
                print('no criteria executed')


        print('candidate_score_list: ',candidate_score_list)
        max_sim=max(candidate_score_list)

        print('max sim:',max_sim)
        if max_sim>0.85:
            group_id=location_entity_table['group_id'][candidates[candidate_score_list.index(max_sim)]]
            location_entity_table=location_entity_table.append({'entity':input_entity,'group_id':group_id,'vector':input_vector},ignore_index=True)
            print('entity grouped: ',{'entity':input_entity,'group_id':group_id})

        else:
            last=max(list(location_entity_table['group_id']))
            location_entity_table=location_entity_table.append({'entity':input_entity,'group_id':last+1,'vector':input_vector},ignore_index=True)
            print('entity added: ',{'entity':input_entity,'group_id':last+1})

    else:
        last=max(list(location_entity_table['group_id']))
        location_entity_table=location_entity_table.append({'entity':input_entity,'group_id':last+1,'vector':input_vector},ignore_index=True)
        print('entity added: ',{'entity':input_entity,'group_id':last+1})

# company_grouper

In [37]:
def get_urls(tag, n, language):
    urls = [url for url in search(tag, stop=n, lang=language)]
    return urls

#get_urls('glassmkaing companies uk list',10,'en')

In [58]:
def find_aliases(input_company):
    #input_company='m&s'
    alias_list=[]
    try:
        link=get_urls(input_company+' company wikipedia page',1,'en')[0]
        print('link from google: ',link)
        if 'wikipedia' in link:
            link=re.sub('.+wiki\/','',link)
        print('cleaned name from link: ',link)
        input_company=link
        site = pywikibot.Site("en", "wikipedia")
        page = pywikibot.Page(site,input_company)
        item = pywikibot.ItemPage.fromPage(page)
        item_dict = item.get()
        alias_list=item_dict['aliases']['en']
    except Exception as e:
        print('error in google serp api: ',e)
        alias_list=[]


    return alias_list

In [55]:
def add_company_aliases_to_table(input_company,group_id):
    input_company=input_company.lower()
    global company_name_entity_table
    alias_list=[]
    flag=True

    try:

        alias_list=find_aliases(input_company)

        #check similarity of the input entit yand the aliases found for that entity to make sure we have the right page
        for i in alias_list:

            #print(input_company, i)

            if fuzz.partial_ratio(input_company, i.lower())>=90:
                flag=True
                print('wiki page found is relevant')
                break
            else:
                flag=False

        if flag==False:
            raise Exception('wiki page aliases didnt match so adding the enitity as new entry in table')

        if input_company not in alias_list:
            alias_list.append(input_company)
            
        for i in alias_list:
            company_name_entity_table=company_name_entity_table.append({'entity':i.lower(),'group_id':group_id},ignore_index=True)

    except Exception as e:
            print('error occurred: ',e)
            print('adding the entity as new entry')
            company_name_entity_table=company_name_entity_table.append({'entity':input_company,'group_id':group_id},ignore_index=True)

    return

In [49]:
company_name_entity_table = pd.DataFrame({'entity':'','group_id':int()},index=[])


In [50]:
# add values in the sample database
add_company_aliases_to_table(input_company='walmart',group_id=0)
add_company_aliases_to_table(input_company='tesco',group_id=1)
add_company_aliases_to_table(input_company='google',group_id=2)

link from google:  https://en.wikipedia.org/wiki/Walmart
cleaned name from link:  Walmart
walmart Wal-Mart
walmart Wal Mart
walmart Wal-Mart Stores, Inc.
walmart Walmart, Inc.
wiki page found is relevant
link from google:  https://en.wikipedia.org/wiki/Tesco
cleaned name from link:  Tesco
tesco Tesco PLC
wiki page found is relevant
link from google:  https://en.wikipedia.org/wiki/Google
cleaned name from link:  Google
google Google Inc.
wiki page found is relevant


In [51]:
company_name_entity_table

Unnamed: 0,entity,group_id
0,wal-mart,0
1,wal mart,0
2,"wal-mart stores, inc.",0
3,"walmart, inc.",0
4,walmart inc.,0
5,wallmart,0
6,wal-mart,0
7,wall mart,0
8,wall-mart,0
9,walmart,0


In [57]:
def company_grouper(input_company):
    input_company=input_company.lower()
    global company_name_entity_table
    flag=False
    for index, row in company_name_entity_table.iterrows():
        if fuzz.partial_ratio(input_company, row['entity'])>=90:
            #print(input_company, row['entity'])
            flag=True
            group_id=row['group_id']
            company_name_entity_table=company_name_entity_table.append({'entity':input_company,'group_id':group_id},ignore_index=True)
            print('entity grouped: ',{'entity':input_company,'group_id':group_id})
            break
        else:
            flag=False
            pass

    if flag==False:
        print('entity not present in the existing database, trying to find aliases on wikidata before adding it')
        last=max(list(company_name_entity_table['group_id']))
        add_company_aliases_to_table(input_company=input_company,group_id=last+1)

    return 



# classifier_main_function

In [44]:
def classifier_main_function(input_entity):
    global model
    global tokenizer
    global serial_number_entity_table
    global physical_good_entity_table
    global location_entity_table
    # prediction (this is the starting point of our full model)

    sentence = [input_entity] # convert into list
    sequences = tokenizer.texts_to_sequences(sentence)
    padded = pad_sequences(sequences, maxlen=maxlen, padding='post', truncating='pre')
    output_class=reversed_labels[np.argmax(model.predict(padded)[0], axis=-1)]
    print('the input entity is classified as: ', output_class)


    if output_class=='serial_number':
        serial_grouper(input_entity)
        print('-'*60)
        print(serial_number_entity_table)

    elif output_class=='physical_good':
        goods_grouper(input_entity)
        print('-'*60)
        print(physical_good_entity_table)

    elif output_class=='location':
        location_grouper(input_entity)
        print('-'*60)
        print(location_entity_table)
        
    elif output_class=='company_name':
        company_grouper(input_entity)
        print('-'*60)
        print(company_name_entity_table)

    return
    

In [56]:
# calling the master function
input_entity=input('enter entity')
classifier_main_function(input_entity)

the input entity is classified as:  company_name
entity not present in the existing database, trying to find aliases on wikidata before adding it
link from google:  https://en.wikipedia.org/wiki/IBM
cleaned name from link:  IBM
wiki page found is relevant
------------------------------------------------------------
                                         entity  group_id
0                                      wal-mart         0
1                                      wal mart         0
2                         wal-mart stores, inc.         0
3                                 walmart, inc.         0
4                                  walmart inc.         0
5                                      wallmart         0
6                                      wal-mart         0
7                                     wall mart         0
8                                     wall-mart         0
9                                       walmart         0
10                                    tesco p