# Importing Modules and Creating Instance

## Summary 

- DATA USED - Naviga_Entity_Extraction_v002.xlsx


- FILES CREATED - locations_norm2raw_v036, locations_raw2norm_v036, organizations_norm2raw_v036, organizations_raw2norm_v036, persons_norm2raw_v036, persons_raw2norm_v036
               

- Matching algorithms of notebook 'entity_normalization_ver_0_3_6' were combined in a script file Normalization_Helper.py and used, Co Document Ratio was not used



In [2]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import json
import math
import copy
import random
from collections import defaultdict, Counter, OrderedDict
from itertools import combinations
import unicodedata
import re
import matplotlib.pyplot as plt
import pickle

In [3]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
import wikipedia as wiki
import Levenshtein as lvst
from metaphone import doublemetaphone as dmt_phone
import networkx as nx
import nxviz as nv
from sparse_dot_topn import awesome_cossim_topn as cossim_topn

In [5]:
import spacy
from spacy import displacy
from spacy.symbols import nsubj, VERB, NOUN
from spacy.tokens import Doc, Token, Span
from spacy.attrs import ENT_IOB, ENT_TYPE
from spacy.tokenizer import Tokenizer

In [6]:
df=pd.read_excel("Naviga_Entity_Extraction_v002.xlsx")
df = df.fillna("")

In [7]:
persons= df["Person_Naviga"]

In [8]:
company = df["Company_Naviga"]

In [9]:
location = df["Location_Naviga"]

In [10]:
persons

0               Kobe Bryant
1              LeBron James
2            Michael Jordan
3               Bill Russel
4              Robert Horry
                ...        
27836          Seth Goodman
27837            Gary Davis
27838            Doug Finke
27839         Zoya Akhtar's
27840    Arundhuti Banerjee
Name: Person_Naviga, Length: 27841, dtype: object

# Methodology
In this section, I explore several approaches to normalize entity names. Experiments is being done for *Person names*.

- **Matching Algorithms** (Ensemble or Hierarchical)
    - String simiarity: Levenshtein distance, Jaro-Winkler distance
    - Normalizing to ASCII (for non-english character-encoded names)
    - Removing special characters and white spaces
    - Doublemetaphone matching
    - Word sorting by first letters of chunk (for example, William Telle &rarr; Telle William)
    - Subword-level TF-IDF expansion and consine-simliarity
    - Abbreviation/Acronym matching by Wikipedia data investigation and linking
    - Expanding to normal words (e.g. Stephen W. Hawking &rarr; Stephen Willam Hawking) - need to investigate HOW to

- **Use ensemble method**
    - Apply different weight schemes for different entity-types

- **Find normalzed form of names**
    - detect connected components
    - use distance-median, etc.

# Matching Algos

In [11]:
prefixes1 = ['Mr.', 'Mrs.', 'Ms.', 'Miss', 'Dr.', 'Sir', 'Sir.', "Ma'am", 'Maddam']
prefixes2 = ['President', 'Gen.', 'Prime Minister', 'Director', 'Deputy Director', 'Army Gen.'
            , 'General',  'Secretary', 'Treasury Secretary', 'Major General', 'Major Gen.'
            , 'Colonel', 'Lieutenant Colonel', 'LTG', 'Minister', 'Ambassador', 'Premier'
            , 'Management MG', 'MG', 'Lieutenant General', 'Defence Minister', 'Admiral'
            , 'General Manager', 'Party Secretary', 'Speaker', 'Defense Minister'
            , 'Defence Secratery', 'Defense Secratery', 'Marshall', 'Vice President'
            ]
prefixes1_lower = [p.lower() for p in prefixes1]
prefixes2_lower = [p.lower() for p in prefixes2]

prefixes_all = prefixes1 + prefixes2 + prefixes1_lower + prefixes2_lower

for pf in prefixes_all:
    if pf[-1] == '.':
        prefixes_all.append(pf[:-1])
        
for pf in prefixes_all:
    if len(pf.split()) > 1:
        prefixes_all.extend(pf.split())

prefixes_all = list(set(prefixes_all))
prefixes_all.sort(key=len, reverse=True)

               
print(prefixes_all)

['Lieutenant Colonel', 'treasury secretary', 'lieutenant general', 'Treasury Secretary', 'Lieutenant General', 'lieutenant colonel', 'defense secratery', 'Defence Secratery', 'Defense Secratery', 'defence secratery', 'defence minister', 'Defence Minister', 'defense minister', 'Defense Minister', 'deputy director', 'Party Secretary', 'party secretary', 'General Manager', 'Deputy Director', 'general manager', 'vice president', 'Prime Minister', 'Vice President', 'prime minister', 'major general', 'Management MG', 'Major General', 'management mg', 'ambassador', 'Major Gen.', 'lieutenant', 'management', 'Ambassador', 'Management', 'Lieutenant', 'major gen.', 'Secratery', 'Secretary', 'army gen.', 'secretary', 'Army Gen.', 'president', 'President', 'Major Gen', 'major gen', 'secratery', 'army gen', 'Marshall', 'Army Gen', 'Treasury', 'Minister', 'director', 'treasury', 'minister', 'marshall', 'Director', 'Colonel', 'Defence', 'general', 'General', 'Manager', 'Speaker', 'Premier', 'Admiral',

### TF-IDF cosine similarity

In [12]:
def generate_ngrams(data, n=3, captialized_1st=True, prefix2remove=prefixes_all):
    '''
    generate subword-level ngrams
    '''
    # dealing non-ASCII characters
    normal = unicodedata.normalize('NFKD', data).encode('ASCII', 'ignore')
    val = normal.decode("utf-8") # no lovercasing yet
    
    # remove prefix
    if prefix2remove:  
        for prefix in prefix2remove:
            val = val.replace(prefix, '')
            #val = val[1:] if val[0] == ' ' else val # remove any starting white space
    
    # lowercasing
    val = val.lower()
    # remove special characters
    val = re.sub('[^A-Za-z0-9 ]+', ' ', val)
    # remove multiple spaces
    val = re.sub(' +', ' ', val)
    
    #print(val)
    
    # Capitalized 1st letter of every word
    if captialized_1st: 
        val = val.title()
    # padding
    padding = ' '
    val = padding + val + padding
    #string = re.sub(r'[,-./]|\sBD',r'', string)
    
    ngrams = zip(*[val[i:] for i in range(n)])
    
    return [''.join(ngram) for ngram in ngrams]

In [48]:
vectorizer = TfidfVectorizer(min_df=1, analyzer=generate_ngrams)
tf_idf_matrix = vectorizer.fit_transform(persons)

In [49]:
tf_idf_matrix.shape

(27841, 10132)

In [50]:
similarity_matrix = cossim_topn(tf_idf_matrix, tf_idf_matrix.transpose(), 2, 0.01, use_threads=True, n_jobs=4)

In [13]:
def get_matches_df(similarity_matrix, A, B):
        '''
        Takes a matrix with similarity scores and two arrays, A and B,
        as an input and returns the matches with the score as a dataframe.
        Args:
            similarity_matrix (csr_matrix)  : The matrix (dimensions: len(A)*len(B)) with the similarity scores
            A              (pandas.Series)  : The array to be matched (dirty)
            B              (pandas.Series)  : The baseline array (clean)
        Returns:
            pandas.Dataframe : Array with matches between A and B plus scores
        '''
        non_zeros = similarity_matrix.nonzero()

        sparserows = non_zeros[0]
        sparsecols = non_zeros[1]

        nr_matches = sparsecols.size

        in_text = np.empty([nr_matches], dtype=object)
        matched = np.empty([nr_matches], dtype=object)
        similarity = np.zeros(nr_matches)

        in_text = np.array(A)[sparserows]
        matched = np.array(B)[sparsecols]
        similarity = np.array(similarity_matrix.data)

        df_tuples = list(zip(in_text, matched, similarity))

        return pd.DataFrame(df_tuples, columns=['in_text', 'matched', 'similarity'])

In [14]:
def remove_duplicated_match(df, col1='in_text', col2='matched'):
    '''
    generate a temporary column of "pair" and use it for de-duping and remove it
    '''
    temp = list()
    names1 = df[col1].tolist()
    names2 = df[col2].tolist()
    for tp in zip(names1, names2):
        temp.append(str(sorted(list(tp))))
    
    df['pair'] = temp
    df.drop_duplicates(subset="pair", inplace=True)
    df.drop(columns=['pair'], axis=1, inplace=True)
    
    return df

In [15]:
def get_simliar_names_by_tfdif(names, topN=2, threshold=0.7, remove_selfMatch=True, verbose=0):
    '''
    To Do: function description
    '''
    # get subword-level tf-idf features for each names
    vectorizer = TfidfVectorizer(min_df=1, analyzer=generate_ngrams)
    tf_idf_mat = vectorizer.fit_transform(names)
    if verbose:
        print('tf_idf_mat size: {}'.format(tf_idf_mat.shape))
    
    # get top N simliar names by consine simliarity
    similarity_mat = cossim_topn(tf_idf_mat, tf_idf_mat.transpose(),
                                 topN, use_threads=True, n_jobs=4)
    
    # get dataframe of matched result
    df_matched = get_matches_df(similarity_mat, pd.Series(names), pd.Series(names))
    if verbose:
        print('df_matched_raw size: {}'.format(df_matched.shape))
    
    # filter by simliarity threshold and remove self-matching
    df_matched = df_matched[(df_matched.similarity > threshold)]
    
    # remove self-matching:
    if remove_selfMatch:
        df_matched = df_matched[(df_matched.in_text != df_matched.matched)]
    
    # retain only one copy of matched pairs
    #df_matched.drop_duplicates(subset="similarity", inplace=True)
    df_matched = remove_duplicated_match(df_matched)
    
    # reverse sort by simliarty
    df_matched.sort_values(by=['similarity'], ascending=False, inplace=True)
    
    if verbose:
        print('df_matched_final size: {}'.format(df_matched.shape))
    
    return df_matched 

In [54]:
df_test = get_simliar_names_by_tfdif(persons, threshold=0.7)
df_test.head(20)

Unnamed: 0,in_text,matched,similarity
25855,Barbara Broccoli,BARBARA BROCCOLI,1.0
39192,Ms Choi,President Choi,1.0
17333,CONDADO DE HILLSBOROUGH,Condado de Hillsborough,1.0
39096,Sidney Madden,SIDNEY MADDEN,1.0
307,Beaver Dam,BEAVER DAM,1.0
39092,Ari Shapiro,ARI SHAPIRO,1.0
48886,CHRISTIAN LENGES,Christian Lenges,1.0
30713,JEREMIAH JOHNSON,Jeremiah Johnson,1.0
24447,Ms. Verma,Dr. Verma,1.0
1437,Pero la,Pero La,1.0


In [55]:
df_test.shape

(2588, 3)

### First-letter matching

In [16]:
def normalize_unicode_to_ascii(data, lowercasing=True):
    normal = unicodedata.normalize('NFKD', data).encode('ASCII', 'ignore')
    val = normal.decode("utf-8")
    if lowercasing:
        val = val.lower()
    # remove special characters
    val = re.sub('[^A-Za-z0-9 ]+', ' ', val)
    # remove multiple spaces
    val = re.sub(' +', ' ', val)
    return val

In [17]:
def first_letters(word, sorting=False, prefix2remove=prefixes_all):
    if prefix2remove:  
        for prefix in prefix2remove:
            word = word.replace(prefix, '')
        #word = word[1:] if word[0] == ' ' else word # remove any starting white space
    #print(word)
    val = normalize_unicode_to_ascii(word, lowercasing=False)      
    parts = val if val.isupper() and ' ' not in val else val.split() # don't split if val itself is an acronym
    
    res = [p[0].lower() for p in parts]
    if sorting:
        res.sort()
    
    return "".join(res)

In [18]:
def get_1st_letters_match(A_iter, B_iter, sorting=False, partial_match=False, prefix2remove=prefixes_all):
    '''
    param:
        A_iter, B_iter: iterable with same length
    return:
        list of boolean values
    '''
    assert len(A_iter) == len(B_iter)
    
    res = list()
    for tp in zip(A_iter, B_iter):
        first_letters_A = first_letters(tp[0]
                                        , sorting=sorting
                                        , prefix2remove=prefix2remove
                                       )
        
        first_letters_B = first_letters(tp[1]
                                        , sorting=sorting
                                        , prefix2remove=prefix2remove
                                       )

        if len(first_letters_B) < len(first_letters_A):
            first_letters_A, first_letters_B = first_letters_B, first_letters_A
        
        #print(first_letters_A)
        #print(first_letters_B)
        
        if partial_match:
            if first_letters_A in first_letters_B:
                res.append(True)
            else:
                res.append(False)
        else:
            if first_letters_A == first_letters_B:
                res.append(True)
            else:
                res.append(False)
    
    return res

In [63]:
first_letter_match = get_1st_letters_match(df_test.in_text, df_test.matched, sorting=True)

In [64]:
df_test["first_letter_match"] = first_letter_match

In [65]:
df_test.head()

Unnamed: 0,in_text,matched,similarity,first_letter_match
25855,Barbara Broccoli,BARBARA BROCCOLI,1.0,True
39192,Ms Choi,President Choi,1.0,True
17333,CONDADO DE HILLSBOROUGH,Condado de Hillsborough,1.0,True
39096,Sidney Madden,SIDNEY MADDEN,1.0,True
307,Beaver Dam,BEAVER DAM,1.0,True


### Co-document ratio

In [19]:
def co_document_ratio(word_pair, entity_type
                      , entity_subtypes=[], origin=entity_origin
                      , corpusInfo_dict = discovery_res
                      , by_file=True
                     ):
    
    assert entity_type != None
    assert type(entity_subtypes) == list
    
    name1, name2 = word_pair
    origin_name1 = list()
    origin_name2 = list()
    
    if entity_subtypes:
        for subtype in entity_subtypes:
            origin_name1.extend(origin[name1][(entity_type, subtype)])
            origin_name2.extend(origin[name2][(entity_type, subtype)])
    else: # subtypes not designated
        keys1 = [key for key in origin[name1].keys() if key[0] == entity_type]
        keys2 = [key for key in origin[name2].keys() if key[0] == entity_type]
        
        for key in keys1:
            origin_name1.extend(origin[name1][key])
        
        for key in keys2:
            origin_name2.extend(origin[name2][key])
    
    doc_ids_1 = set()
    for seg_id in origin_name1:
        if by_file:
            # this is .pdf filename, i.e. document name
            filename = corpusInfo_dict[seg_id]['filename']

        else: 
            # this is .json filename i.e. paragraph name
            filename = corpusInfo_dict[seg_id]['extracted_metadata']['filename']
        
        doc_ids_1.add(filename)
        
    doc_ids_2 = set()
    for seg_id in origin_name2:
        if by_file:
            # this is .pdf filename, i.e. document name
            filename = corpusInfo_dict[seg_id]['filename']

        else: 
            # this is .json filename i.e. paragraph name
            filename = corpusInfo_dict[seg_id]['extracted_metadata']['filename']
        
        doc_ids_2.add(filename)
    
    #doc_ids_1=set([s.split('_')[0] for s in origin_name1])
    #doc_ids_2=set([s.split('_')[0] for s in origin_name2])
    
    doc_ids_common = doc_ids_1.intersection(doc_ids_2)
    
    #print(doc_ids_1)
    #print(doc_ids_2)
    #print(doc_ids_common)
    
    if not doc_ids_common:
        return 0.
    else:
        return len(doc_ids_common)/(len(doc_ids_1)+len(doc_ids_2))

NameError: name 'entity_origin' is not defined

In [None]:
def get_co_document_ratio(A_iter, B_iter, entity_type, entity_subtypes=[], origin=entity_origin, by_file=True):
    '''
    param:
        A_iter, B_iter: iterable with same length
    return:
        lists of co-document ratio 
    '''
    assert len(A_iter) == len(B_iter)
    
    res = list()
    for tp in zip(A_iter, B_iter):
        co_doc_ratio = co_document_ratio(tp
                                         , entity_type=entity_type
                                         , entity_subtypes=entity_subtypes
                                         , origin=origin
                                         , by_file=by_file
                                        )
        res.append(co_doc_ratio)
    
    return res

In [None]:
co_doc_ratio = get_co_document_ratio(df_test.in_text, df_test.matched, entity_type='Person', by_file=True)

In [None]:
df_test["co_doc_ratio"] = co_doc_ratio

In [None]:
df_test.head(20)

### Miscellaneous
- Nonlinear double-metaphone matching scores
- inverse-Levenshtein distance
- Jaro-Winker distance

In [20]:
def match_doublemetaphone(word_pair, normalize2Ascii=True, prefix2remove=prefixes_all):
    '''
    ToDo: function description
    '''
    w1, w2 = word_pair
    
    if prefix2remove:  
        for prefix in prefix2remove:
            w1 = w1.replace(prefix, '')
            w2 = w2.replace(prefix, '')
    
        # remove any starting white space
        #w1 = w1[1:] if w1[0] == ' ' else w1
        #w2 = w2[1:] if w2[0] == ' ' else w2
    
    match_types = ['ASCII_norm_match', 'strong_match', 'weak_match', 'minimal_match', 'no_match']
    
    if normalize2Ascii:
        w1, w2 = tuple(map(normalize_unicode_to_ascii, (w1, w2)))
        if w1 == w2:
            return (True, match_types[0], (None, None))
    else:
        w1, w2 = word_pair
    
    tp1 = dmt_phone(w1)
    tp2 = dmt_phone(w2)
    
    match = True
    if tp1[0] == tp2[0]: # primary_key match
        match_type = match_types[1]
    elif tp1[0] == tp2[1] or tp1[1] == tp2[0]: # secondary_key == primary_key or vise versa
        match_type = match_types[2]
    elif tp1[1] == tp2[1]: # secondary_key == secondary_key
        match_type = match_types[3]
    else:
        match, match_type = False, match_types[4]
    
    return (match, match_type, (tp1, tp2))

In [21]:
def sigmoid(x, a=1):
    return 1/(1 + np.exp(-a*x))

def dmtph_match_score(x):
    '''
    This function gives rewards for "ascii_norm_match" & "strong_match", but penalties for the rest
    i.e. non-linear weighting
    '''
    return sigmoid(20*(x-0.7))

In [22]:
def get_dmtph_match(A_iter, B_iter, prefix2remove=prefixes_all):
    '''
    param:
        A_iter, B_iter: iterable with same length
    return:
        list of match scores
    '''
    assert len(A_iter) == len(B_iter)
    
    match_types = ['ASCII_norm_match', 'strong_match', 'weak_match', 'minimal_match', 'no_match']
    match_types.reverse()
    x = np.linspace(0, 1, 5)

    res = list()
    for tp in zip(A_iter, B_iter):
        match, match_type, dummy = match_doublemetaphone(tp, prefix2remove=prefix2remove)
        match_score = dmtph_match_score(x[match_types.index(match_type)])
        res.append(match_score)
    
    return res

In [78]:
dmtph_match = get_dmtph_match(df_test.in_text, df_test.matched)

In [79]:
df_test["dmtph_match"] = dmtph_match

In [80]:
df_test.head()

Unnamed: 0,in_text,matched,similarity,first_letter_match,dmtph_match
25855,Barbara Broccoli,BARBARA BROCCOLI,1.0,True,0.997527
39192,Ms Choi,President Choi,1.0,True,0.997527
17333,CONDADO DE HILLSBOROUGH,Condado de Hillsborough,1.0,True,0.997527
39096,Sidney Madden,SIDNEY MADDEN,1.0,True,0.997527
307,Beaver Dam,BEAVER DAM,1.0,True,0.997527


In [23]:
def get_string_distance(A_iter, B_iter, normalize=True, prefix2remove=prefixes_all):
    '''
    param:
        A_iter, B_iter: iterable with same length
    return:
        lists of invese-Levenshtein distance & Jaro-Winkler distance, respectively 
    '''
    assert len(A_iter) == len(B_iter)
    
    res_inv_lvst = list()
    res_jw = list()
    for w1, w2 in zip(A_iter, B_iter):
        if prefix2remove:  
            for prefix in prefix2remove:
                w1 = w1.replace(prefix, '')
                w2 = w2.replace(prefix, '')
        
            # remove any starting white space
            #w1 = w1[1:] if w1[0] == ' ' else w1
            #w2 = w2[1:] if w2[0] == ' ' else w2
        
        if normalize:
            w1, w2 = tuple(map(normalize_unicode_to_ascii, (w1, w2)))
        else:
            pass
        
        inv_lvst_dist = 1/lvst.distance(w1, w2) if lvst.distance(w1, w2) else 1 # avoid dividing by zero
        jw_dist = lvst.jaro_winkler(w1, w2)
        res_inv_lvst.append(inv_lvst_dist)
        res_jw.append(jw_dist)
    
    return res_inv_lvst, res_jw

In [82]:
inv_lvst_dist, jw_dist = get_string_distance(df_test.in_text, df_test.matched)

In [83]:
df_test["inv_lvst_dist"] = inv_lvst_dist
df_test["jw_dist"] = jw_dist

In [86]:
df_test.head()

Unnamed: 0,in_text,matched,similarity,first_letter_match,dmtph_match,inv_lvst_dist,jw_dist
25855,Barbara Broccoli,BARBARA BROCCOLI,1.0,True,0.997527,1.0,1.0
39192,Ms Choi,President Choi,1.0,True,0.997527,1.0,1.0
17333,CONDADO DE HILLSBOROUGH,Condado de Hillsborough,1.0,True,0.997527,1.0,1.0
39096,Sidney Madden,SIDNEY MADDEN,1.0,True,0.997527,1.0,1.0
307,Beaver Dam,BEAVER DAM,1.0,True,0.997527,1.0,1.0


In [88]:
# 2588

# Wrapper

All algo at once

In [24]:
def get_df_match(entity_names, entity_type, entity_subtypes=[]
                 , wiki_match=False
                 , prefix2remove=prefixes_all
                 , sorting_1st_letters=False
                 , partial_1st_letter_match=False
                 , co_doc_by_file=True       
                ):
    
    df = get_simliar_names_by_tfdif(entity_names)
    
    print("# of candidate pairs: {}".format(df.shape[0]))
    if wiki_match:
        df["wiki_match"] = get_wiki_match(df.in_text, df.matched)
    
    df["first_letter_match"] = get_1st_letters_match(df.in_text, df.matched
                                                     , sorting=sorting_1st_letters
                                                     , partial_match=partial_1st_letter_match
                                                     , prefix2remove=prefix2remove
                                                    )
    
#     df["co_doc_ratio"] = get_co_document_ratio(df.in_text, df.matched
#                                                , entity_type=entity_type
#                                                , entity_subtypes=entity_subtypes
#                                                , by_file=co_doc_by_file      
#                                               )
    
    df["dmtph_match"] = get_dmtph_match(df.in_text, df.matched, prefix2remove=prefix2remove)
    
    inv_lvst_dist, jw_dist = get_string_distance(df.in_text, df.matched, prefix2remove=prefix2remove)
    df["inv_lvst_dist"] = inv_lvst_dist
    df["jw_sim"] = jw_dist # in python Levenshtein library, this is actually simlarity ranging [0, 1]
    
    return df

In [90]:
df_test = get_df_match(persons, 'Person', sorting_1st_letters=True, partial_1st_letter_match=True)

# of candidate pairs: 2588


In [91]:
df_test.head()

Unnamed: 0,in_text,matched,similarity,first_letter_match,dmtph_match,inv_lvst_dist,jw_sim
25855,Barbara Broccoli,BARBARA BROCCOLI,1.0,True,0.997527,1.0,1.0
39192,Ms Choi,President Choi,1.0,True,0.997527,1.0,1.0
17333,CONDADO DE HILLSBOROUGH,Condado de Hillsborough,1.0,True,0.997527,1.0,1.0
39096,Sidney Madden,SIDNEY MADDEN,1.0,True,0.997527,1.0,1.0
307,Beaver Dam,BEAVER DAM,1.0,True,0.997527,1.0,1.0


# Weighted average of matchings


In [25]:
def get_ensemble_match(df, weights=[]):
    '''
    get a weighted average of matching scores
    '''
    row_num = df.shape[0]
    matchings = df.columns[2:]
    if weights:
        assert len(weights) == len(matchings)
        weights = np.array(weights)
    else:
        weights = np.ones(len(matchings))
    
    match_score = pd.Series(np.zeros(row_num), dtype='float64')
    for weight, matching in zip(weights, matchings):
        match_score += weight*df[matching]
    
    match_score = match_score/weights.sum()
    
    return match_score

In [26]:
def add_match_score(df, weights=[]):
    if 'match_score' in df.columns:
        df.drop(['match_score'], axis=1, inplace=True)
    df['match_score'] = get_ensemble_match(df, weights=weights)
    df.sort_values(by='match_score', ascending=False, inplace=True)
    df.reset_index(drop=True, inplace=True)

In [94]:
weights_person = [4,2,1,1,1]
add_match_score(df_test, weights=weights_person)

In [96]:
df_test.head(50)

Unnamed: 0,in_text,matched,similarity,first_letter_match,dmtph_match,inv_lvst_dist,jw_sim,match_score
0,Beaver Dam,BEAVER DAM,1.0,True,0.9975274,1.0,1.0,0.999725
1,President Trump's,Mr. Trump's,1.0,True,0.9975274,1.0,1.0,0.999725
2,Pero la,Pero La,1.0,True,0.9975274,1.0,1.0,0.999725
3,Brian Mahoney,BRIAN MAHONEY,1.0,True,0.9975274,1.0,1.0,0.999725
4,Kim Jong-un,Kim Jong Un,1.0,True,0.9975274,1.0,1.0,0.999725
5,NADIA BOULANGER,Nadia Boulanger,1.0,True,0.9975274,1.0,1.0,0.999725
6,Gov DeWine,Gov. DeWine,1.0,True,0.9975274,1.0,1.0,0.999725
7,PLANT SALE,Plant Sale,1.0,True,0.9975274,1.0,1.0,0.999725
8,DEAR ABBY,Dear Abby,1.0,True,0.9975274,1.0,1.0,0.999725
9,Bill de Blasio's,Bill De Blasio's,1.0,True,0.9975274,1.0,1.0,0.999725


In [97]:
df_test.match_score.describe()

count    206.000000
mean       0.718828
std        0.175420
min        0.394210
25%        0.593098
50%        0.735240
75%        0.815751
max        0.999725
Name: match_score, dtype: float64

### cut off

In [27]:
def cut_off(df, q=None, lower_bound=0.0):
    assert lower_bound >= 0.0 and lower_bound <= 1.0
    if not lower_bound:
        if not q:
            return df[df.match_score > df.match_score.mean()]
        else:
            assert type(q) == float
            return df[df.match_score > df.match_score.quantile(q)]
    else: # given nonzero lowever bound, use it as a filter
        eps = 1e-10
        return df[df.match_score >= lower_bound-eps] # inclusive

In [99]:
df_match = cut_off(df_test)

In [100]:
df_test.shape

(2588, 8)

In [101]:
df_match.shape

(107, 8)

In [102]:
df_match.tail()

Unnamed: 0,in_text,matched,similarity,first_letter_match,dmtph_match,inv_lvst_dist,jw_sim,match_score
102,Benjamin Harris,Benjamin Hardin,0.779806,True,0.0001233946,0.5,1.0,0.735483
103,Robert Kennedy,Robert F. Kennedy,0.781836,True,0.0001233946,0.5,0.9875,0.734996
104,"Madeleine ""Maddie"" McCann",Madeleine McCann,0.866477,True,0.0001233946,0.142857,1.0,0.734321
105,Al Gore Sr,Al Gore,0.820867,True,0.0001233946,0.333333,0.97,0.731881
106,Joe Garcia,Joe L. Garcia,0.778947,True,8.31528e-07,0.5,0.966667,0.731384


# Normalizing algorithms

## Connected-Components

In [103]:
edge_list = list(df_match[["in_text", "matched", "match_score"]].itertuples(index=False, name=None))

In [104]:
edge_list[:5]

[('Beaver Dam', 'BEAVER DAM', 0.9997252640937074),
 ("President Trump's", "Mr. Trump's", 0.9997252640937072),
 ('Pero la', 'Pero La', 0.9997252640937072),
 ('Brian Mahoney', 'BRIAN MAHONEY', 0.9997252640937072),
 ('Kim Jong-un', 'Kim Jong Un', 0.9997252640937072)]

In [105]:
# generate graph
graph_ = nx.Graph()
graph_.add_weighted_edges_from(edge_list)

# extract connected components and reverse sort by length of components
connected_comp = [c for c in nx.connected_components(graph_)]
connected_comp.sort(key=len, reverse=True)

In [28]:
def get_connected_comp(df):
    edge_list = list(df[["in_text", "matched", "match_score"]].itertuples(index=False, name=None))
    graph_ = nx.Graph()
    graph_.add_weighted_edges_from(edge_list)

    # extract connected components and reverse sort by length of components
    connected_comp = [c for c in nx.connected_components(graph_)]
    connected_comp.sort(key=len, reverse=True)
    
    return connected_comp

In [107]:
# Definitely simple averaging over matchings are not optimal
connected_comp

[{"Mr. Trump's", "Mrs. Trump's", "President Trump's"},
 {"Bill De Blasio's", 'Bill de Blasio', "Bill de Blasio's"},
 {'Mr. Jones', 'Mrs. Jones', 'Ms. Jones'},
 {'Mr. Trump', 'Mrs. Trump', 'President Trump'},
 {'Madeleine "Maddie" McCann', 'Madeleine McCann', 'van Madeleine McCann'},
 {'BEAVER DAM', 'Beaver Dam'},
 {'Pero La', 'Pero la'},
 {'BRIAN MAHONEY', 'Brian Mahoney'},
 {'Kim Jong Un', 'Kim Jong-un'},
 {'NADIA BOULANGER', 'Nadia Boulanger'},
 {'Gov DeWine', 'Gov. DeWine'},
 {'PLANT SALE', 'Plant Sale'},
 {'DEAR ABBY', 'Dear Abby'},
 {'BEETHOVEN', 'Beethoven'},
 {'Mr. Carter', 'President Carter'},
 {'RIHANNA', 'Rihanna'},
 {'LADY GAGA', 'Lady Gaga'},
 {'JOE BIDEN', 'Joe Biden'},
 {'Dr Shah', 'Dr. Shah'},
 {'Dr. Cohen', 'Mr. Cohen'},
 {'STEVE MCQUEEN', 'Steve McQueen'},
 {'ALMA MAHLER', 'Alma Mahler'},
 {'BEEKMAN STREET ART FAIR', 'Beekman Street Art Fair'},
 {'BRENDA BETHUNE', 'Brenda Bethune'},
 {'ADIRONDACK WINE', 'Adirondack Wine'},
 {'TROY NIGHT OUT', 'Troy Night Out'},
 {'Drew

### Normalization
We choose the normalized term for each connected component
- Levenshtein-Median modifies original term (not desirable)
- length-based method sometimes yeilds multiple choices

After discussion with the dev team, we apply the followng rules to choose normalized terms

1. no period at the end
2. no hyphen anywhere
3. should start with a Uppercased letter
4. choose by string length (longest or shorted with repect to entity types)

I choose simply the first (or the last) term of each connected component as its normalized term.

In [29]:
def choose_normalized(connected_comp, choice='lvst'):
    '''
    choose one normalized term for each connected component by rule-based system
    NOTE: "longest = False" will choose the shortest
    '''
    res = list()
    for comp in connected_comp:
          
        # Rule-1: no period at the end
        temp = [name for name in comp if name[-1] != '.']
        comp = temp if temp else comp
        
        # Rule-2.1: no hyphen anywhere
        temp = [name for name in comp if '-' not in name]
        comp = temp if temp else comp
        
        # Rule-2.2: no '/' anywhere
        temp = [name for name in comp if '/' not in name]
        comp = temp if temp else comp
        
        # Rule-3: start with uppercase
        temp = [name for name in comp if name[0].isupper()]
        comp = temp if temp else comp
        
        # Rule-4: no unicode (no '\u2003')
        temp = []
        for name in comp:
            normal = unicodedata.normalize('NFKD', name).encode('ASCII', 'ignore')
            val = normal.decode("utf-8")
            temp.append(val)
            
        comp = list(set(temp)) if temp else comp     
        
        # Rule-5: choose by length or Levenshtein Median
        comp.sort(key=len)
        if choice == 'long':
            res.append(comp[-1])
        elif choice == 'short':
            res.append(comp[0])
        elif choice == 'lvst':
            res.append(lvst.median(comp))
        else:
            print("Not a valid choice methond")
            return None
    
    return res

In [110]:
normalized_terms = choose_normalized(connected_comp, choice='long')

In [111]:
len(normalized_terms)

102

In [112]:
normalized_terms[:50]

["President Trump's",
 "Bill De Blasio's",
 'Mrs. Jones',
 'President Trump',
 'Madeleine "Maddie" McCann',
 'BEAVER DAM',
 'Pero la',
 'Brian Mahoney',
 'Kim Jong Un',
 'Nadia Boulanger',
 'Gov. DeWine',
 'PLANT SALE',
 'DEAR ABBY',
 'BEETHOVEN',
 'President Carter',
 'Rihanna',
 'LADY GAGA',
 'JOE BIDEN',
 'Dr. Shah',
 'Mr. Cohen',
 'Steve McQueen',
 'ALMA MAHLER',
 'Beekman Street Art Fair',
 'Brenda Bethune',
 'Adirondack Wine',
 'Troy Night Out',
 "Drew Brees'",
 'Dr. Martin Luther King',
 'Fair Board President',
 'Kareem Abdul Jabar',
 'Brian Douglass',
 'Christiano Ronaldo',
 'Robert Cooke',
 'Breonne Taylor',
 'Valerie A. Arkoosh',
 'Don Nelson',
 'Antonio Guterres',
 'John McCrae',
 'Nic Davies',
 'Steven Mnuchin',
 'Susanna Clark',
 "Abraham Lincoln's",
 "Ahmaud Arbery's",
 "Noah Gragson's",
 "Christopher Nolan's",
 "Asa Hutchinson's",
 "Derek Chauvin's",
 "Denver Bazaar's",
 "Lin-Manuel Miranda's",
 "Michelle Lujan Grisham's"]

In [30]:
def make_norm_dict(components, norms):
    raw2normalized = dict()
    normalized2raw = defaultdict(list)
    for comp, norm in zip(components, norms):
        for name in comp:
            raw2normalized[name] = norm
            normalized2raw[norm].append(name)
    
    return raw2normalized, normalized2raw

In [116]:
raw2normalized, normalized2raw = make_norm_dict(connected_comp, normalized_terms)

In [117]:
print("# of raw (in-snippet) entiteis normalized: {}".format(len(raw2normalized)))
print("# of normalized keys: {}".format(len(normalized2raw)))

# of raw (in-snippet) entiteis normalized: 209
# of normalized keys: 102


# Entity Normalization

## Person

In [122]:
df_persons = get_df_match(persons, 'Person', wiki_match=False, sorting_1st_letters=True)

# of candidate pairs: 2588


In [123]:
weights_person = [4,2,1,1,1]
add_match_score(df_persons, weights=weights_person)

In [124]:
df_persons = cut_off(df_persons)

In [125]:
df_persons.shape

(69, 8)

In [126]:
df_persons.head(10)

Unnamed: 0,in_text,matched,similarity,first_letter_match,dmtph_match,inv_lvst_dist,jw_sim,match_score
0,Beaver Dam,BEAVER DAM,1.0,True,0.997527,1.0,1.0,0.999725
1,Ms. Jones,Mrs. Jones,1.0,True,0.997527,1.0,1.0,0.999725
2,Brian Mahoney,BRIAN MAHONEY,1.0,True,0.997527,1.0,1.0,0.999725
3,Kim Jong-un,Kim Jong Un,1.0,True,0.997527,1.0,1.0,0.999725
4,NADIA BOULANGER,Nadia Boulanger,1.0,True,0.997527,1.0,1.0,0.999725
5,Gov DeWine,Gov. DeWine,1.0,True,0.997527,1.0,1.0,0.999725
6,PLANT SALE,Plant Sale,1.0,True,0.997527,1.0,1.0,0.999725
7,DEAR ABBY,Dear Abby,1.0,True,0.997527,1.0,1.0,0.999725
8,President Carter,Mr. Carter,1.0,True,0.997527,1.0,1.0,0.999725
9,President Trump's,Mrs. Trump's,1.0,True,0.997527,1.0,1.0,0.999725


In [127]:
df_persons.tail(10)

Unnamed: 0,in_text,matched,similarity,first_letter_match,dmtph_match,inv_lvst_dist,jw_sim,match_score
59,William Lesh,William Leslie,0.711813,True,8.31528e-07,0.333333,1.0,0.686732
60,Kenny Porterfield,Kady Porterfield,0.76204,True,0.0001233946,0.333333,0.791702,0.685924
61,Tod von George Floyd,von George Floyds Tod,0.819859,True,8.31528e-07,0.111111,0.752381,0.682547
62,Mar&#237;a Blanco,Mar&#237;a Brenes,0.712363,True,0.0001233946,0.25,1.0,0.67773
63,Garret Smith,Garrett Smithley,0.723815,True,8.31528e-07,0.25,0.938889,0.676016
64,Calais Campbell,Chris Campbell,0.711167,True,0.0001233946,0.333333,0.834643,0.668085
65,Ben Crump,Benjamin Crump,0.735444,True,0.0001233946,0.2,0.851852,0.665972
66,Valerie Arkoosh,Valerie A. Arkoosh,0.935304,False,0.7310586,0.5,0.996078,0.66315
67,James Phillips,J. Phillips,0.71063,True,8.31528e-07,0.25,0.779286,0.652423
68,William Bryan,Bryan Williams,0.776557,True,8.31528e-07,0.083333,0.61485,0.644935


In [128]:
ver_string="2"
f_name = 'person_matching' + ver_string
df_persons.to_csv("corpus/" + f_name + ".csv")

In [129]:
persons_connected = connected_comp

In [130]:
len(persons_connected)

102

In [131]:
persons_raw2norm = raw2normalized
persons_norm2raw = normalized2raw

In [132]:
list(persons_raw2norm.items())[:5]

[("Mr. Trump's", "President Trump's"),
 ("President Trump's", "President Trump's"),
 ("Mrs. Trump's", "President Trump's"),
 ("Bill de Blasio's", "Bill De Blasio's"),
 ('Bill de Blasio', "Bill De Blasio's")]

In [133]:
list(persons_norm2raw.items())[:5]

[("President Trump's", ["Mr. Trump's", "President Trump's", "Mrs. Trump's"]),
 ("Bill De Blasio's",
  ["Bill de Blasio's", 'Bill de Blasio', "Bill De Blasio's"]),
 ('Mrs. Jones', ['Ms. Jones', 'Mr. Jones', 'Mrs. Jones']),
 ('President Trump', ['Mrs. Trump', 'President Trump', 'Mr. Trump']),
 ('Madeleine "Maddie" McCann',
  ['Madeleine McCann', 'van Madeleine McCann', 'Madeleine "Maddie" McCann'])]

In [134]:
data_path="corpus/"

In [135]:
f_name = 'persons_raw2norm' + ver_string
json_dp = json.dumps(persons_raw2norm)
with open(data_path + f_name + ".json","w") as f:
    f.write(json_dp)

f_name = 'persons_norm2raw' + ver_string
json_dp = json.dumps(persons_norm2raw)
with open(data_path + f_name + ".json","w") as f:
    f.write(json_dp)

# Company


In [31]:
df_company = get_df_match(company, 'Company', wiki_match=False, sorting_1st_letters=True)

# of candidate pairs: 3774


In [32]:
weights_company = [4,2,1,1,1]
add_match_score(df_company, weights=weights_company)

In [33]:
df_company = cut_off(df_company)

In [34]:
df_company.shape

(210, 8)

In [35]:
company_connected = get_connected_comp(df_company)

In [36]:
normalized_terms = choose_normalized(company_connected, choice='long')

In [37]:
raw2normalized, normalized2raw = make_norm_dict(company_connected, normalized_terms)

In [38]:
print("# of raw (in-snippet) entiteis normalized: {}".format(len(raw2normalized)))
print("# of normalized keys: {}".format(len(normalized2raw)))

# of raw (in-snippet) entiteis normalized: 400
# of normalized keys: 194


In [40]:
ver_string="2"
f_name = 'company_matching' + ver_string
df_company.to_csv("corpus/" + f_name + ".csv")

In [41]:
company_raw2norm = raw2normalized
company_norm2raw = normalized2raw

In [42]:
list(company_raw2norm.items())[:5]

[('World Health  Organization', "World Health Organization's"),
 ('World Health Organization', "World Health Organization's"),
 ("World Health Organization's", "World Health Organization's"),
 ('World Health Organisation', "World Health Organization's"),
 ('Cdc', 'CDC')]

In [43]:
list(company_norm2raw.items())[:5]

[("World Health Organization's",
  ['World Health  Organization',
   'World Health Organization',
   "World Health Organization's",
   'World Health Organisation']),
 ('CDC', ['Cdc', 'cdc', 'CDC']),
 ('M. G', ['M. G', 'M G', 'M.-G']),
 ('GrubHub', ['Grubhub', 'GRUBHUB', 'GrubHub']),
 ('Department of Treasury',
  ['Department of Defense',
   'Department of Treasury',
   'Department of Defence'])]

In [44]:
data_path="corpus/"

In [45]:
f_name = 'company_raw2norm' + ver_string
json_dp = json.dumps(company_raw2norm)
with open(data_path + f_name + ".json","w") as f:
    f.write(json_dp)

f_name = 'company_norm2raw' + ver_string
json_dp = json.dumps(company_norm2raw)
with open(data_path + f_name + ".json","w") as f:
    f.write(json_dp)

# Location

In [60]:
df_location = get_df_match(location, 'Location', wiki_match=False, sorting_1st_letters=True)

# of candidate pairs: 4622


In [61]:
weights_location = [4,2,1,1,1]
add_match_score(df_location, weights=weights_location)

In [62]:
df_location = cut_off(df_location)

In [63]:
df_location.shape

(379, 8)

In [64]:
location_connected = get_connected_comp(df_location)
connected_comp = location_connected

## post processing

In [65]:
def get_postProcessigUnits(connected_comp):
    connected_comp_short_ = sorted([list(comp) for comp in connected_comp if len(comp) == 2], key=lambda x:x[0])
    connected_comp_long_ = sorted([list(comp) for comp in connected_comp if len(comp) > 2], key=lambda x:x[0])
    connected_comp_all_ = sorted([list(comp) for comp in connected_comp], key=lambda x:x[0])
    
    connected_comp_short = [(i, comp) for i, comp in enumerate(connected_comp_short_)]
    connected_comp_long = [(i, comp) for i, comp in enumerate(connected_comp_long_)]
    connected_comp_all = [(i, comp) for i, comp in enumerate(connected_comp_all_)]
    
    return connected_comp_short, connected_comp_long, connected_comp_all 

In [66]:
connected_comp_short, connected_comp_long, connected_comp_all = get_postProcessigUnits(connected_comp)

In [67]:
print(len(connected_comp_short))
print(len(connected_comp_long))
print(len(connected_comp_all))

333
19
352


In [68]:
connected_comp_short[:20]

[(0, ['12th St.', '12th Street']),
 (1, ['13th St.', '13th Street']),
 (2, ['2nd District', '42nd District']),
 (3, ['4 Franklin Square', 'Franklin Square']),
 (4, ['4314 39th Ave.', '4314 39th Avenue']),
 (5, ['75th Avenue', '5th Avenue']),
 (6, ['ALABAMA', 'Alabama']),
 (7, ['ALBANY', 'Albany']),
 (8, ['ASIA', 'Asia']),
 (9, ['Alaska', 'ALASKA']),
 (10, ['American Legion Post 1', 'American Legion Post 10']),
 (11, ['American Legion Post 366', 'American Legion Post 333']),
 (12, ['American Legion Post 552', 'American Legion Post 502']),
 (13, ['Antwerp', 'Antwerpen']),
 (14, ['Arizona', 'ARIZONA']),
 (15, ['Arlington', 'ARLINGTON']),
 (16, ['Armenia', 'ARMENIA']),
 (17, ['Asia-Pacific', 'Asia Pacific']),
 (18, ['Atlanta', 'ATLANTA']),
 (19, ['Austin', 'AUSTIN'])]

In [69]:
len(connected_comp_long)

19

In [70]:
connected_comp_long

[(0, ['12th Ave. North', '12th Avenue North', '12th Ave. N']),
 (1, ['COLUMBUS', 'Columbus', "Columbus'"]),
 (2, ['Columbus Ohio', 'Columbus, Ohio', 'COLUMBUS, Ohio']),
 (3, ['Dhaka', 'DHaka', 'DHAKA']),
 (4, ['Fulton', 'FULTON', 'Fultondale']),
 (5, ['IRVINE, California', 'Irvine, California', 'IRVINE, Calif.']),
 (6, ["Illinois'", 'ILLINOIS', 'Illinois']),
 (7, ['JACKSON, Miss.', 'Jackson', 'JACKSON']),
 (8, ['Los Angeles, Ca.', 'LOS ANGELES, CA', 'Los Angeles, California']),
 (9, ['Middle East', 'Middle-east', 'Middle-East']),
 (10, ['NEW YORK', 'New York', 'New-York']),
 (11, ['Norfolk, Virginia', 'Norfolk, VA', 'NORFOLK, Va.']),
 (12, ['RICHMOND, Va', 'RICHMOND, VA', 'Richmond, VA']),
 (13, ['Saratoga county', 'SARATOGA COUNTY', 'Saratoga County']),
 (14, ['St. Louis', 'ST. LOUIS', 'St Louis']),
 (15, ["Texas'", 'Texas', 'TEXAS']),
 (16,
  ['WASHINGTON D.C.',
   'Washington D.C.',
   'Washington, D.C.',
   'WASHINGTON, D.C.']),
 (17, ['Washington DC', 'WASHINGTON DC', 'WASHINGTON,

In [75]:
# manual insepction to excude, connect & add
index2exclude = [4,7]
index2connect = [(16,17)]
comp2add = []

In [76]:
len(index2exclude)

2

In [77]:
def chunking(connected_comp_any, index2connect=[] ,index2exclude=[]):
    chunks = list()
    if index2connect:
        for tp in index2connect: 
            chunk = list()
            for idx in tp:
                chunk.extend(list(connected_comp_any[idx][1]))
                index2exclude.append(idx)

            chunks.append(chunk)
        
    return chunks, index2exclude

In [78]:
def get_comp_final(connected_comp, connected_comp_short, is_all=True
                   , *, index2connect=[], index2exclude=[], comp2add=[]):
    chunks, index2exclude = chunking(connected_comp, index2connect=index2connect, index2exclude=index2exclude)
    
    res = [list(a[1]) for i, a in enumerate(connected_comp) if i not in index2exclude]
    res.extend(chunks)
    res.extend(comp2add)
    
    if not is_all:
        res += [list(a[1]) for a in connected_comp_short]
    
    return res

In [79]:
connected_comp_final = get_comp_final(connected_comp_long, connected_comp_short, is_all=False
                                     , index2connect=index2connect
                                     , index2exclude=index2exclude
                                     , comp2add=comp2add
                                     )

In [80]:
print(len(connected_comp))
print(len(connected_comp_final))

352
349


In [81]:
location_connected = connected_comp_final

In [82]:
normalized_terms = choose_normalized(location_connected, choice='long')

In [83]:
raw2normalized, normalized2raw = make_norm_dict(location_connected, normalized_terms)

In [85]:
print("# of raw (in-snippet) entiteis normalized: {}".format(len(raw2normalized)))
print("# of normalized keys: {}".format(len(normalized2raw)))

# of raw (in-snippet) entiteis normalized: 719
# of normalized keys: 349


In [86]:
ver_string="2"
f_name = 'location_matching' + ver_string
df_location.to_csv("corpus/" + f_name + ".csv")

In [87]:
location_raw2norm = raw2normalized
location_norm2raw = normalized2raw

In [88]:
list(location_raw2norm.items())[:5]

[('12th Ave. North', '12th Avenue North'),
 ('12th Avenue North', '12th Avenue North'),
 ('12th Ave. N', '12th Avenue North'),
 ('COLUMBUS', "Columbus'"),
 ('Columbus', "Columbus'")]

In [93]:
list(location_norm2raw.items())[:5]

[('12th Avenue North',
  ['12th Ave. North', '12th Avenue North', '12th Ave. N']),
 ("Columbus'", ['COLUMBUS', 'Columbus', "Columbus'"]),
 ('COLUMBUS, Ohio', ['Columbus Ohio', 'Columbus, Ohio', 'COLUMBUS, Ohio']),
 ('DHAKA', ['Dhaka', 'DHaka', 'DHAKA']),
 ('Irvine, California',
  ['IRVINE, California', 'Irvine, California', 'IRVINE, Calif.'])]

In [94]:
data_path="corpus/"

In [95]:
f_name = 'location_raw2norm' + ver_string
json_dp = json.dumps(location_raw2norm)
with open(data_path + f_name + ".json","w") as f:
    f.write(json_dp)

f_name = 'location_norm2raw' + ver_string
json_dp = json.dumps(location_norm2raw)
with open(data_path + f_name + ".json","w") as f:
    f.write(json_dp)