In [22]:
# pip install jellyfish

In [25]:
# pip install fuzzywuzzy

In [26]:
# pip install epitran

In [27]:
import pandas as pd
import numpy as np
import jellyfish
from fuzzywuzzy import fuzz
import re
import epitran

In [28]:
# Epitran language code used for grapheme-to-IPA conversion
# ('tgl-Latn' = Tagalog written in Latin script).
EPITRAN_LANG = 'tgl-Latn'
epi = epitran.Epitran(EPITRAN_LANG)

In [54]:
# Query business name: lowercase it, then drop everything that is not an
# ASCII letter, digit, or space.
raw_query = 'asia'
input_bn = re.sub(r'[^A-Za-z0-9 ]', '', raw_query.lower())
input_bn

'asia'

In [55]:
# Preview the IPA transliteration of the cleaned query; this is the string
# the 'epitran' similarity score is later computed against.
epi.transliterate(input_bn)

'ʔasia'

In [56]:
def clean_text(text):
    """Strip every character that is not an ASCII letter, digit, or space.

    Characters are deleted, not replaced with a space, so hyphenated names
    are joined together ('atok-big' -> 'atokbig').

    Parameters
    ----------
    text : str
        Text to clean. Non-string values (None, NaN, numbers) are treated
        as missing.

    Returns
    -------
    str
        The cleaned text, or '' when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # pandas represents missing cells as float NaN, which re.sub rejects
        # with a TypeError; treat them as empty names instead of crashing.
        return ''
    return re.sub(r'[^A-Za-z0-9 ]', '', text)

# Build the lookup table of company names:
#   'Company Name '  - raw name as read from the CSV (header has a trailing space)
#   'Business Name'  - lowercased name passed through clean_text
#   'ipa'            - Epitran transliteration of the cleaned name
# Malformed CSV lines are skipped rather than raising.
df_bn = (
    pd.read_csv('./company.csv', on_bad_lines='skip')[['Company Name ']]
    .assign(**{
        'Business Name': lambda frame: frame['Company Name '].str.lower().apply(clean_text),
    })
    .assign(ipa=lambda frame: frame['Business Name'].apply(epi.transliterate))
)

df_bn

Unnamed: 0,Company Name,Business Name,ipa
0,"8990 Holdings, Inc. Series B Perpetual Preferr...",8990 holdings inc series b perpetual preferred...,8990 holdiŋs ink seɾies b peɾpetual pɾefeɾɾed ...
1,"8990 Holdings, Inc. Series A Perpetual Preferr...",8990 holdings inc series a perpetual preferred...,8990 holdiŋs ink seɾies a peɾpetual pɾefeɾɾed ...
2,Asia Amalgamated Holdings Corporation,asia amalgamated holdings corporation,ʔasia amalɡamated holdiŋs koɾpoɾation
3,"Atok-Big Wedge Co., Inc.",atokbig wedge co inc,ʔatokbiɡ wedɡe ko ink
4,"AbaCore Capital Holdings, Inc.",abacore capital holdings inc,ʔabakoɾe kapital holdiŋs ink
...,...,...,...
282,"Wilcon Depot, Inc.",wilcon depot inc,wilkon depot ink
283,"Waterfront Philippines, Incorporated",waterfront philippines incorporated,wateɾfɾont philippines inkoɾpoɾated
284,Xurpas Inc.,xurpas inc,ksuɾpas ink
285,NexGen Energy Corp.,nexgen energy corp,neksɡen eneɾɡj koɾp


In [57]:

# Score every company against the query with four similarity measures.
# Each score is fuzz.ratio (an integer from 0 to 100) applied to a different
# representation of the two names:
#   levenshtein - the cleaned text itself
#   soundex     - jellyfish Soundex codes
#   metaphone   - jellyfish Metaphone codes
#   epitran     - Epitran IPA transliterations
#
# The query-side encodings are the same for every row, so compute them once
# here instead of once per row inside each lambda.
query_soundex = jellyfish.soundex(input_bn)
query_metaphone = jellyfish.metaphone(input_bn)
query_ipa = epi.transliterate(input_bn)

df_bn['levenshtein'] = df_bn['Business Name'].apply(
    lambda name: fuzz.ratio(input_bn, name))
df_bn['soundex'] = df_bn['Business Name'].apply(
    lambda name: fuzz.ratio(query_soundex, jellyfish.soundex(name)))
df_bn['metaphone'] = df_bn['Business Name'].apply(
    lambda name: fuzz.ratio(query_metaphone, jellyfish.metaphone(name)))
df_bn['epitran'] = df_bn['ipa'].apply(
    lambda ipa: fuzz.ratio(query_ipa, ipa))
df_bn

Unnamed: 0,Company Name,Business Name,ipa,levenshtein,soundex,metaphone,epitran
0,"8990 Holdings, Inc. Series B Perpetual Preferr...",8990 holdings inc series b perpetual preferred...,8990 holdiŋs ink seɾies b peɾpetual pɾefeɾɾed ...,11,0,6,11
1,"8990 Holdings, Inc. Series A Perpetual Preferr...",8990 holdings inc series a perpetual preferred...,8990 holdiŋs ink seɾies a peɾpetual pɾefeɾɾed ...,11,0,12,11
2,Asia Amalgamated Holdings Corporation,asia amalgamated holdings corporation,ʔasia amalɡamated holdiŋs koɾpoɾation,20,50,15,24
3,"Atok-Big Wedge Co., Inc.",atokbig wedge co inc,ʔatokbiɡ wedɡe ko ink,17,50,12,23
4,"AbaCore Capital Holdings, Inc.",abacore capital holdings inc,ʔabakoɾe kapital holdiŋs ink,19,50,9,24
...,...,...,...,...,...,...,...
282,"Wilcon Depot, Inc.",wilcon depot inc,wilkon depot ink,10,25,0,10
283,"Waterfront Philippines, Incorporated",waterfront philippines incorporated,wateɾfɾont philippines inkoɾpoɾated,21,0,0,20
284,Xurpas Inc.,xurpas inc,ksuɾpas ink,43,25,0,38
285,NexGen Energy Corp.,nexgen energy corp,neksɡen eneɾɡj koɾp,0,25,0,8


In [58]:
# Top 20 candidates by plain-text similarity, best match first.
(
    df_bn[['Company Name ', 'levenshtein']]
    .sort_values('levenshtein', ascending=False)
    .reset_index(drop=True)
    .head(20)
)

Unnamed: 0,Company Name,levenshtein
0,"Paxys, Inc.",46
1,Xurpas Inc.,43
2,"ATN Holdings, Inc. ``A``",36
3,"Asian Terminals, Inc.",35
4,"Haus Talk, Inc.",35
5,"Jackstones, Inc.",33
6,"Vistamalls, Inc.",33
7,MacroAsia Corporation,32
8,A. Soriano Corporation,32
9,"AREIT, Inc.",31


In [59]:
# Top 20 candidates by Soundex-code similarity, best match first.
(
    df_bn[['Company Name ', 'soundex']]
    .sort_values('soundex', ascending=False)
    .reset_index(drop=True)
    .head(20)
)

Unnamed: 0,Company Name,soundex
0,"Asian Terminals, Inc.",50
1,Ayala Corporation Class ``B`` Series 3 Preferr...,50
2,"APC Group, Inc.",50
3,Ayala Corporation Class ``B`` Series 2 Preferr...,50
4,A. Soriano Corporation,50
5,Asia United Bank Corporation,50
6,AllHome Corp.,50
7,"Altus Property Ventures, Inc.",50
8,"Anchor Land Holdings, Inc.",50
9,"Apex Mining Co., Inc.",50


In [60]:
# Top 20 candidates by Metaphone-code similarity, best match first.
(
    df_bn[['Company Name ', 'metaphone']]
    .sort_values('metaphone', ascending=False)
    .reset_index(drop=True)
    .head(20)
)

Unnamed: 0,Company Name,metaphone
0,Ayala Corporation,33
1,A. Soriano Corporation,29
2,Arthaland Corporation,27
3,ABS-CBN Corporation,27
4,Nickel Asia Corporation,27
5,"Asian Terminals, Inc.",25
6,"AREIT, Inc.",22
7,AllHome Corp.,20
8,Asia United Bank Corporation,20
9,Now Corporation,20


In [61]:
# Top 20 candidates by IPA-transliteration similarity, best match first;
# the 'ipa' column is shown alongside so the matched string can be inspected.
(
    df_bn[['Company Name ', 'ipa', 'epitran']]
    .sort_values('epitran', ascending=False)
    .reset_index(drop=True)
    .head(20)
)

Unnamed: 0,Company Name,ipa,epitran
0,"ATN Holdings, Inc. ``A``",ʔatn holdiŋs ink a,43
1,"Paxys, Inc.",paksjs ink,40
2,"Asian Terminals, Inc.",ʔasian teɾminals ink,40
3,"AREIT, Inc.",ʔaɾeit ink,40
4,"Ionics, Inc.",ʔioniks ink,38
5,Xurpas Inc.,ksuɾpas ink,38
6,A. Soriano Corporation,ʔa soɾiano koɾpoɾation,37
7,"AllDay Marts, Inc.",ʔalldaj maɾts ink,36
8,"ATN Holdings, Inc. ``B``",ʔatn holdiŋs ink b,35
9,"Haus Talk, Inc.",haus talk ink,33
