In [1]:
import re
import pprint
# Raw company-name variants with inconsistent casing, spacing, punctuation
# and trailing suffixes; every later cleaning step starts from this list.
company_name_raw = [
    'Fusemachines  INC',
    'FUSEMACHINES      NEPAL, INC.',
    'FUSEMACHINES',
    'fusemachines',
    'fuse org.',
    'fuse organization',
]
pprint.pprint(company_name_raw)

['Fusemachines  INC',
 'FUSEMACHINES      NEPAL, INC.',
 'FUSEMACHINES',
 'fusemachines',
 'fuse org.',
 'fuse organization']


In [2]:
# Lower-case every raw name so later steps can ignore capitalisation.
company_names_lower = list(map(str.lower, company_name_raw))
pprint.pprint(company_names_lower)

['fusemachines  inc',
 'fusemachines      nepal, inc.',
 'fusemachines',
 'fusemachines',
 'fuse org.',
 'fuse organization']


In [4]:
# Any character that is not a lower-case ASCII letter, a digit or whitespace
# counts as a separator; each such character is replaced by one space.
irrelevent_seperators = re.compile(r'[^a-z0-9\s]')
company_names_rel = [
    irrelevent_seperators.sub(' ', lowered_name)
    for lowered_name in company_names_lower
]
pprint.pprint(company_names_rel)

['fusemachines  inc',
 'fusemachines      nepal  inc ',
 'fusemachines',
 'fusemachines',
 'fuse org ',
 'fuse organization']


In [5]:
# Collapse every run of whitespace into a single space and trim both ends.
# `\s+` is used instead of `\s\s+` so that a lone tab or newline is also
# normalised to a space; the two-or-more pattern would leave it untouched.
multispace_regex = re.compile(r'\s+')
company_names_multispace = [
    multispace_regex.sub(' ', spaced_name).strip()
    for spaced_name in company_names_rel
]
pprint.pprint(company_names_multispace)

['fusemachines inc',
 'fusemachines nepal inc',
 'fusemachines',
 'fusemachines',
 'fuse org',
 'fuse organization']


In [6]:
# Generic tokens (legal forms, organisation words, a country name) that are
# treated as noise when comparing company names. Kept in alphabetical order.
business_stopwords = {
    'co',
    'company',
    'inc',
    'incorporated',
    'ltd',
    'nepal',
    'org',
    'organization',
    'pvt',
    # add further tokens here as needed
}

In [8]:
# Remove stop-word tokens from each normalised name and re-join the rest
# with single spaces.
# Note: a name consisting only of stop-words collapses to an empty string.
company_names_clean = [
    ' '.join(token for token in name.split() if token not in business_stopwords)
    for name in company_names_multispace
]
pprint.pprint(company_names_clean)

['fusemachines', 'fusemachines', 'fusemachines', 'fusemachines', 'fuse', 'fuse']


In [9]:
!pip install probablepeople
import probablepeople as pp

Collecting probablepeople
  Downloading probablepeople-0.5.4-py2.py3-none-any.whl (888 kB)
Collecting python-crfsuite>=0.8
  Downloading python_crfsuite-0.9.8-cp39-cp39-win_amd64.whl (158 kB)
Collecting doublemetaphone
  Downloading DoubleMetaphone-1.1-cp39-cp39-win_amd64.whl (28 kB)
Collecting probableparsing
  Downloading probableparsing-0.0.1-py2.py3-none-any.whl (3.1 kB)
Installing collected packages: python-crfsuite, probableparsing, doublemetaphone, probablepeople
Successfully installed doublemetaphone-1.1 probableparsing-0.0.1 probablepeople-0.5.4 python-crfsuite-0.9.8


In [10]:
# Parse one raw company name with probablepeople; the displayed result pairs
# each token with the label assigned to it.
pp.parse('FUSEMACHINES      NEPAL, INC.')

[('FUSEMACHINES', 'CorporationName'),
 ('NEPAL,', 'CorporationName'),
 ('INC.', 'CorporationLegalType')]

In [11]:
def _corporation_name_tokens(raw_name):
    """Return the tokens of ``raw_name`` that are labelled 'CorporationName'.

    Tokens are returned exactly as the parser emits them, in their original
    order; tokens carrying any other label are discarded.
    """
    return [
        token
        for token, label in pp.parse(raw_name)
        if label == 'CorporationName'
    ]


# Alternative cleaning route: parse the *raw* names and keep only the
# name-bearing tokens of each one (one list of tokens per input name).
company_names_alternative_1 = [_corporation_name_tokens(c) for c in company_name_raw]
pprint.pprint(company_names_alternative_1)

[['Fusemachines'],
 ['FUSEMACHINES', 'NEPAL,'],
 ['FUSEMACHINES'],
 ['fusemachines'],
 ['fuse', 'org.'],
 ['fuse', 'organization']]


In [12]:
# Phone-number clean-up: raw numbers written with mixed '/' and '-'
# separators, one of them carrying a leading '1-' prefix.
phone_numbers_raw = [
    '800/506-3873',
    '1-800-506-3873',
    '800-506/3873',
    '800/506-3873',
    '213/665-1661',
    '213/665-1661',
]
pprint.pprint(phone_numbers_raw)

['800/506-3873',
 '1-800-506-3873',
 '800-506/3873',
 '800/506-3873',
 '213/665-1661',
 '213/665-1661']


In [13]:
!pip install phonenumbers
import phonenumbers

Collecting phonenumbers
  Downloading phonenumbers-8.13.4-py2.py3-none-any.whl (2.6 MB)
Installing collected packages: phonenumbers
Successfully installed phonenumbers-8.13.4


In [14]:
# Parse every raw number (passing 'US' as the region argument) and render it
# in E.164 format, preserving the input order.
clean_ph = [
    phonenumbers.format_number(
        phonenumbers.parse(raw_number, 'US'),
        num_format=phonenumbers.PhoneNumberFormat.E164,
    )
    for raw_number in phone_numbers_raw
]

In [15]:
# Display the E.164-formatted phone numbers.
clean_ph

['+18005063873',
 '+18005063873',
 '+18005063873',
 '+18005063873',
 '+12136651661',
 '+12136651661']

In [16]:
from phonenumbers import timezone

In [17]:
# Parse a number written with a '+977' prefix (region argument "NP") and
# display the time zone name(s) phonenumbers associates with it.
timezone.time_zones_for_number(phonenumbers.parse('+977-1-4168530',"NP"))

('Asia/Katmandu',)

In [18]:
import pandas as pd 
# Pair the untouched company names and phone numbers row by row so the raw
# input can be compared with the cleaned version.
raw_data = pd.DataFrame(
    {
        'company': company_name_raw,
        'phone number': phone_numbers_raw,
    }
)
raw_data

Unnamed: 0,company,phone number
0,Fusemachines INC,800/506-3873
1,"FUSEMACHINES NEPAL, INC.",1-800-506-3873
2,FUSEMACHINES,800-506/3873
3,fusemachines,800/506-3873
4,fuse org.,213/665-1661
5,fuse organization,213/665-1661


In [19]:
# Same two columns as the raw frame, built from the cleaned company names
# and the E.164 phone numbers.
clean_data = pd.DataFrame(
    {
        'company': company_names_clean,
        'phone number': clean_ph,
    }
)
clean_data

Unnamed: 0,company,phone number
0,fusemachines,18005063873
1,fusemachines,18005063873
2,fusemachines,18005063873
3,fusemachines,18005063873
4,fuse,12136651661
5,fuse,12136651661


In [20]:
# Show the cleaned records with fully identical rows collapsed (first
# occurrence kept). The result is only displayed; clean_data is not modified.
clean_data.drop_duplicates()

Unnamed: 0,company,phone number
0,fusemachines,18005063873
4,fuse,12136651661


In [21]:
# Column duplication: find and drop columns that repeat another column
import pandas as pd
import numpy as np
# Toy records in which the last two fields always hold the same value, and
# the third field relabels the second one-to-one (M1<->C3, M2<->A3, M3<->B3).
data = [
    (10.5, 'M1', 'C3', 5, 5),
    (8.3, 'M2', 'A3', 6.1, 6.1),
    (8, 'M1', 'C3', 2.5, 2.5),
    (11.99, 'M3', 'B3', 5.1, 5.1),
]
# Columns are named f1 .. f5, in field order.
df = pd.DataFrame(data, columns=[f'f{position}' for position in range(1, 6)])
df

Unnamed: 0,f1,f2,f3,f4,f5
0,10.5,M1,C3,5.0,5.0
1,8.3,M2,A3,6.1,6.1
2,8.0,M1,C3,2.5,2.5
3,11.99,M3,B3,5.1,5.1


In [22]:
# Drop every column whose values repeat an earlier column (the first
# occurrence is kept). Duplicates are detected on the transposed frame, but
# the columns are selected from the original frame with a boolean mask, so
# each surviving column keeps its dtype. Transposing there and back on
# mixed-type data would leave every column as `object`.
# `.copy()` detaches the result from the source frame so later column
# assignments operate on an independent DataFrame.
df = df.loc[:, ~df.T.duplicated()].copy()

In [23]:
# Display the frame after the duplicate-column drop.
df

Unnamed: 0,f1,f2,f3,f4
0,10.5,M1,C3,5.0
1,8.3,M2,A3,6.1
2,8.0,M1,C3,2.5
3,11.99,M3,B3,5.1


In [24]:
# Replace the labels in f2 and f3 with 1-based integer codes assigned in
# order of first appearance, so columns that differ only by label text end
# up holding identical values.
for label_column in ('f2', 'f3'):
    df[label_column] = df[label_column].factorize()[0] + 1

In [25]:
# Display the frame with f2 and f3 replaced by integer codes.
df

Unnamed: 0,f1,f2,f3,f4
0,10.5,1,1,5.0
1,8.3,2,2,6.1
2,8.0,1,1,2.5
3,11.99,3,3,5.1


In [26]:
# Second duplicate-column pass, run after f2/f3 were integer-encoded: drop
# any column whose values repeat an earlier column (first occurrence kept).
# Selecting columns with a boolean mask instead of transposing twice keeps
# each column's own dtype; a round trip through `.T` would coerce all
# columns to one common dtype. `.copy()` returns an independent DataFrame.
df = df.loc[:, ~df.T.duplicated()].copy()

In [27]:
# Display the final frame after the second duplicate-column drop.
df

Unnamed: 0,f1,f2,f4
0,10.5,1,5.0
1,8.3,2,6.1
2,8.0,1,2.5
3,11.99,3,5.1
