In [63]:
import pandas as pd
import numpy as np
import regex
from fuzzywuzzy import fuzz
import phonenumbers
import tldextract as tdl
import pycountry
import json

In [2]:
# Master table that every later cell in this notebook reads or updates.
full_df=pd.read_parquet('../../datasets/output/full_df_web_enriched.parquet')

# Per-source tables. fb_norm_df is the right-hand side of the merge further
# down; g_norm_df and w_norm_df are not referenced again in the visible cells.
fb_norm_df=pd.read_parquet('../../datasets/working/fb_norm.parquet')
g_norm_df=pd.read_parquet('../../datasets/working/g_norm.parquet')
w_norm_df=pd.read_parquet('../../datasets/working/w_norm.parquet')

In [3]:
# Case-insensitive patterns; each accepts any string that ends with the given
# social-network host name.
social_media_patterns = [
    r"(?i).*facebook\.com$",   # Facebook
    r"(?i).*twitter\.com$",    # Twitter
    r"(?i).*instagram\.com$",  # Instagram
]


def is_social_media_domain(domain):
    """Flag whether ``domain`` matches one of ``social_media_patterns``.

    Returns:
        ``np.nan`` for a null input, ``'Y'`` / ``'N'`` for a string, and the
        input itself (unchanged) for any other type.
    """
    if pd.isnull(domain):
        return np.nan
    if not isinstance(domain, str):
        return domain
    is_social = any(regex.match(candidate, domain) for candidate in social_media_patterns)
    return 'Y' if is_social else 'N'

def calculate_company_name_fuzzy_score(row, column_x='company_name_norm_x', column_y='company_name_norm_y'):
    """Return the ``fuzz.ratio`` similarity (0-100) of two company-name fields.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) holding both names.
        column_x: Key of the first name; defaults to ``'company_name_norm_x'``.
        column_y: Key of the second name; defaults to ``'company_name_norm_y'``.

    Returns:
        0 when either name is missing, otherwise ``fuzz.ratio`` of the two
        names converted to strings.
    """
    raw_x, raw_y = row[column_x], row[column_y]

    # Test for missing values *before* stringifying: str(None) is 'None', so a
    # plain "== 'nan'" check would let two missing names score a perfect 100.
    if pd.isnull(raw_x) or pd.isnull(raw_y):
        return 0

    company_name_x, company_name_y = str(raw_x), str(raw_y)

    # Kept for backward compatibility: a literal 'nan' string (a previously
    # stringified NaN) still counts as missing.
    if company_name_x == 'nan' or company_name_y == 'nan':
        return 0

    return fuzz.ratio(company_name_x, company_name_y)

def check_category_inclusion(row):
    """Return 1 when ``category_y`` adds something to ``category_x``, else 0.

    Rules (``row`` must provide ``'category_x'`` and ``'category_y'``):
      * ``category_y`` missing                         -> 0
      * ``category_x`` missing, ``category_y`` present -> 1
      * both present -> 0 if ``str(category_x)`` is a substring of
        ``str(category_y)``, otherwise 1

    Missing means None/NaN/NA. The literal string ``'nan'`` is also treated as
    missing, as before.
    """
    raw_x, raw_y = row['category_x'], row['category_y']

    # Detect nulls on the raw values: str(None) is 'None', which a plain
    # "== 'nan'" comparison would treat as a real category.
    x_missing = pd.isnull(raw_x) or str(raw_x) == 'nan'
    y_missing = pd.isnull(raw_y) or str(raw_y) == 'nan'

    if y_missing:
        return 0
    if x_missing:
        return 1
    # NOTE(review): this tests x-in-y, not y-in-x; confirm that is the
    # intended direction.
    return 0 if str(raw_x) in str(raw_y) else 1

def get_country_from_phone(phone_number):
    """Return the lower-case region code for a phone number, or None.

    Args:
        phone_number: Phone number string. It is parsed with no default
            region, so it must carry its own country calling code.

    Returns:
        The lower-case region code (e.g. ``'us'``), or None when the input is
        not a string, cannot be parsed, or does not resolve to an alphabetic
        region code.
    """
    # Nulls and numeric values cannot be parsed; treat them as "no country".
    if not isinstance(phone_number, str):
        return None

    try:
        parsed_number = phonenumbers.parse(phone_number, None)
    except phonenumbers.phonenumberutil.NumberParseException:
        return None

    country_code = phonenumbers.region_code_for_number(parsed_number)
    # region_code_for_number may return None (previously an AttributeError on
    # .lower()). Non-alphabetic results are rejected as well, which covers the
    # library's placeholder for non-geographic numbers ('001').
    if not country_code or not country_code.isalpha():
        return None
    return country_code.lower()
    
def extract_tdl(domain):
    """Return the text after the last ``'.'`` in ``domain``.

    Args:
        domain: Domain name string, e.g. ``'example.co.uk'``.

    Returns:
        The last dot-separated label (``'uk'`` for the example above), an
        empty string when the domain ends with a dot, or None when the input
        contains no dot or is not a string (None/NaN).
    """
    # Null domains previously raised a TypeError inside the regex search.
    if not isinstance(domain, str):
        return None
    _, dot, suffix = domain.rpartition('.')
    return suffix if dot else None

def is_country_code_valid(country_code):
    """Return the alpha-2 code when ``country_code`` is a known country.

    Args:
        country_code: Candidate two-letter country code.

    Returns:
        ``country.alpha_2`` as stored by pycountry when the lookup succeeds,
        otherwise None (unknown code, empty string or non-string input).
    """
    # None/NaN or other non-string values can never be a valid code.
    if not isinstance(country_code, str) or not country_code:
        return None
    try:
        country = pycountry.countries.get(alpha_2=country_code)
    except LookupError:
        # LookupError is the base class of KeyError, so this still covers the
        # case the original handled, plus lookups that raise LookupError.
        return None
    return country.alpha_2 if country else None

def extract_zip_code(row, address_column, country_code_column):
    """Pull the first postal-code-looking token out of a free-text address.

    The pattern is chosen from the row's country code (``us``, ``ca``, ``gb``,
    ``uk``, ``au`` or ``tr``); any other code returns None. Both fields are
    converted with ``str()`` before use.

    Returns:
        The first match of the country's pattern in the address, or None.
    """
    country_code = str(row[country_code_column])
    text = str(row[address_column])

    # 'gb' and 'uk' share the same pattern.
    uk_pattern = r"(?i)\b(?:[A-Z]{1,2}\d[A-Z\d]?|\d[A-Z\d]{1,2}) \d[A-Z]{2}\b"
    patterns_by_country = {
        "us": r"(?i)\b\d{5}(?:-\d{4})?\b",
        "ca": r"(?i)\b[A-Za-z]\d[A-Za-z] \d[A-Za-z]\d\b",
        "gb": uk_pattern,
        "uk": uk_pattern,
        "au": r"(?i)\b\d{4}\b",
        "tr": r"(?i)\b\d{5}\b",
    }

    pattern = patterns_by_country.get(country_code)
    if pattern is None:
        return None

    found = regex.search(pattern, text)
    return found.group(0) if found else None

In [4]:
# matched_data is loaded here but not referenced again in the visible cells.
matched_data = pd.read_parquet('../../datasets/working/merged_G_fb.parquet')

# One boolean mask per source tag found in the identifier. Only mask_facebook
# is used later in the visible cells.
mask_websites, mask_google, mask_facebook = (
    full_df['identifier'].str.contains(source_tag)
    for source_tag in ('website', 'google', 'facebook')
)


In [5]:
# Coverage snapshot before any enrichment in this notebook: number of rows in
# full_df that lack each attribute.
total_records = len(full_df)
null_country = int(full_df['country_code'].isnull().sum())
null_domain = int(full_df['domain'].isnull().sum())
null_city = int(full_df['city'].isnull().sum())
null_zip = int(full_df['zip_code'].isnull().sum())
null_phones = int(full_df['phone_parsed'].isnull().sum())
null_category = int(full_df['category'].isnull().sum())
null_address = int(full_df['address'].isnull().sum())

# Each count is printed with its share of all rows, rounded to 2 decimals.
print(f'Total records                 : {total_records}    [100.00 %]')
print(f'Total records no country      :  {null_country}    [ {round((null_country/total_records)*100,2)} %]')
print(f'Total records no domain       :      {null_domain}    [  {round((null_domain/total_records)*100,2)}  %]')
print(f'Total records no city         :  {null_city}    [ {round((null_city/total_records)*100,2)} %]')
print(f'Total records no zip          :  {null_zip}    [ {round((null_zip/total_records)*100,2)} %]')
print(f'Total records no phone_parsed :  {null_phones}    [  {round((null_phones/total_records)*100,2)} %]')
print(f'Total records no category     :  {null_category}    [ {round((null_category/total_records)*100,2)} %]')
print(f'Total records no address      :  {null_address}    [  {round((null_address/total_records)*100,2)} %]')

Total records                 : 500503    [100.00 %]
Total records no country      :  62702    [ 12.53 %]
Total records no domain       :      1    [  0.0  %]
Total records no city         :  64300    [ 12.85 %]
Total records no zip          :  191064    [ 38.17 %]
Total records no phone_parsed :  61666    [  12.32 %]
Total records no category     :  47212    [ 9.43 %]
Total records no address      :  112583    [  22.49 %]


In [6]:
# Rows that have a parsed phone number but no country yet.
mask_phone_no_country = full_df['phone_parsed'].notnull() & full_df['country_code'].isnull()

df = full_df.loc[mask_phone_no_country].copy()

# Derive the country from the phone number. get_country_from_phone already
# lower-cases its result; the extra .str.lower() is kept as in the original.
df['country_code'] = df['phone_parsed'].apply(get_country_from_phone).str.lower()

# DataFrame.update aligns on the index and only copies non-null values, so
# rows whose phone yielded no country stay null in full_df.
full_df.update(df)




In [7]:
# Checkpoint of full_df as it stands at this point in the notebook. The later
# cells keep updating full_df, but no later visible cell writes it out again.
full_df.to_parquet('../../datasets/output/full_df_web_enriched_2.parquet')

In [8]:
# Coverage snapshot after the phone-based country fill: number of rows in
# full_df that lack each attribute.
total_records = len(full_df)
null_country = int(full_df['country_code'].isnull().sum())
null_domain = int(full_df['domain'].isnull().sum())
null_city = int(full_df['city'].isnull().sum())
null_zip = int(full_df['zip_code'].isnull().sum())
null_phones = int(full_df['phone_parsed'].isnull().sum())
null_category = int(full_df['category'].isnull().sum())
null_address = int(full_df['address'].isnull().sum())

# Each count is printed with its share of all rows, rounded to 2 decimals.
print(f'Total records                 : {total_records}    [100.00 %]')
print(f'Total records no country      :  {null_country}    [ {round((null_country/total_records)*100,2)} %]')
print(f'Total records no domain       :      {null_domain}    [  {round((null_domain/total_records)*100,2)}  %]')
print(f'Total records no city         :  {null_city}    [ {round((null_city/total_records)*100,2)} %]')
print(f'Total records no zip          :  {null_zip}    [ {round((null_zip/total_records)*100,2)} %]')
print(f'Total records no phone_parsed :  {null_phones}    [  {round((null_phones/total_records)*100,2)} %]')
print(f'Total records no category     :  {null_category}    [ {round((null_category/total_records)*100,2)} %]')
print(f'Total records no address      :  {null_address}    [  {round((null_address/total_records)*100,2)} %]')

Total records                 : 500503    [100.00 %]
Total records no country      :  28296    [ 5.65 %]
Total records no domain       :      1    [  0.0  %]
Total records no city         :  64300    [ 12.85 %]
Total records no zip          :  191064    [ 38.17 %]
Total records no phone_parsed :  61666    [  12.32 %]
Total records no category     :  47212    [ 9.43 %]
Total records no address      :  112583    [  22.49 %]


In [9]:

# Despite its name, this mask selects rows with no country whose domain is
# flagged as not being a social-media domain.
mask_address_not_null = full_df['country_code'].isnull() & (full_df['social_media_flag'] == 'N')

missing_address = full_df.loc[mask_address_not_null].copy()

# Take the last label of the domain and keep it as the country code only when
# it is a known alpha-2 country code.
# NOTE(review): some country-code TLDs (e.g. .io, .co, .tv) are widely used by
# companies outside that country; confirm this heuristic is acceptable.
missing_address['tdl'] = missing_address['domain'].apply(extract_tdl)
missing_address['is_tdl_country'] = missing_address['tdl'].apply(is_country_code_valid)
missing_address['country_code'] = np.where(
    missing_address['is_tdl_country'].isnull(),
    None,
    missing_address['tdl'],
)
missing_address = missing_address.drop(columns=['tdl', 'is_tdl_country'])

# Only non-null values are copied back into full_df.
full_df.update(missing_address)

In [10]:
# Coverage snapshot after the TLD-based country fill: number of rows in
# full_df that lack each attribute.
total_records = len(full_df)
null_country = int(full_df['country_code'].isnull().sum())
null_domain = int(full_df['domain'].isnull().sum())
null_city = int(full_df['city'].isnull().sum())
null_zip = int(full_df['zip_code'].isnull().sum())
null_phones = int(full_df['phone_parsed'].isnull().sum())
null_category = int(full_df['category'].isnull().sum())
null_address = int(full_df['address'].isnull().sum())

# Each count is printed with its share of all rows, rounded to 2 decimals.
print(f'Total records                 : {total_records}    [100.00 %]')
print(f'Total records no country      :  {null_country}    [ {round((null_country/total_records)*100,2)} %]')
print(f'Total records no domain       :      {null_domain}    [  {round((null_domain/total_records)*100,2)}  %]')
print(f'Total records no city         :  {null_city}    [ {round((null_city/total_records)*100,2)} %]')
print(f'Total records no zip          :  {null_zip}    [ {round((null_zip/total_records)*100,2)} %]')
print(f'Total records no phone_parsed :  {null_phones}    [  {round((null_phones/total_records)*100,2)} %]')
print(f'Total records no category     :  {null_category}    [ {round((null_category/total_records)*100,2)} %]')
print(f'Total records no address      :  {null_address}    [  {round((null_address/total_records)*100,2)} %]')

Total records                 : 500503    [100.00 %]
Total records no country      :  19374    [ 3.87 %]
Total records no domain       :      1    [  0.0  %]
Total records no city         :  64300    [ 12.85 %]
Total records no zip          :  191064    [ 38.17 %]
Total records no phone_parsed :  61666    [  12.32 %]
Total records no category     :  47212    [ 9.43 %]
Total records no address      :  112583    [  22.49 %]


In [11]:
mask_zip_null = full_df['zip_code'].isnull()

# Country distribution of the rows without a zip code. This is not referenced
# again in the visible cells.
countries = full_df.loc[mask_zip_null, 'country_code'].value_counts()

# Narrow the mask to rows that have an address to extract a zip code from.
mask_zip_null = mask_zip_null & full_df['address'].notnull()
missing_zip = full_df.loc[mask_zip_null].copy()
missing_zip['zip_code'] = missing_zip.apply(
    lambda record: extract_zip_code(record, 'address', 'country_code'),
    axis=1,
)

# Only non-null values are copied back, so rows without a match keep a null
# zip code in full_df.
full_df.update(missing_zip)


In [47]:
# Coverage snapshot after the zip-code extraction: number of rows in full_df
# that lack each attribute.
total_records = len(full_df)
null_country = int(full_df['country_code'].isnull().sum())
null_domain = int(full_df['domain'].isnull().sum())
null_city = int(full_df['city'].isnull().sum())
null_zip = int(full_df['zip_code'].isnull().sum())
null_phones = int(full_df['phone_parsed'].isnull().sum())
null_category = int(full_df['category'].isnull().sum())
null_address = int(full_df['address'].isnull().sum())

# Each count is printed with its share of all rows, rounded to 2 decimals.
print(f'Total records                 : {total_records}    [100.00 %]')
print(f'Total records no country      :  {null_country}    [ {round((null_country/total_records)*100,2)} %]')
print(f'Total records no domain       :      {null_domain}    [  {round((null_domain/total_records)*100,2)}  %]')
print(f'Total records no city         :  {null_city}    [ {round((null_city/total_records)*100,2)} %]')
print(f'Total records no zip          :  {null_zip}    [ {round((null_zip/total_records)*100,2)} %]')
print(f'Total records no phone_parsed :  {null_phones}    [  {round((null_phones/total_records)*100,2)} %]')
print(f'Total records no category     :  {null_category}    [ {round((null_category/total_records)*100,2)} %]')
print(f'Total records no address      :  {null_address}    [  {round((null_address/total_records)*100,2)} %]')

Total records                 : 500503    [100.00 %]
Total records no country      :  19374    [ 3.87 %]
Total records no domain       :      1    [  0.0  %]
Total records no city         :  64300    [ 12.85 %]
Total records no zip          :  181168    [ 36.2 %]
Total records no phone_parsed :  61666    [  12.32 %]
Total records no category     :  47212    [ 9.43 %]
Total records no address      :  112583    [  22.49 %]


### Full DF enrichment with Facebook Data
- I will do an inner join on phone_parsed, zip_code and country_code.

In [38]:
# Rows take part in the join only when all three join keys are present.
mask_no_null_on_join = full_df[['phone_parsed', 'zip_code', 'country_code']].notnull().all(axis=1)
mask_no_null_on_join_fb = fb_norm_df[['phone_parsed', 'zip_code', 'country_code']].notnull().all(axis=1)

# Rows of full_df whose identifier contains 'facebook' are excluded from the
# left side. Overlapping non-key columns get the default _x (full_df) and
# _y (fb_norm_df) suffixes.
enrich_fb = pd.merge(
    full_df[mask_no_null_on_join & ~mask_facebook],
    fb_norm_df[mask_no_null_on_join_fb],
    how='inner',
    on=['phone_parsed', 'zip_code', 'country_code'],
)

In [39]:
def calculate_company_name_fuzzy_score(row, column_x='company_name_norm_x', column_y='company_name_norm_y'):
    """Return the ``fuzz.ratio`` similarity (0-100) of two company-name fields.

    A function with this name is defined more than once in this notebook. This
    signature accepts both call styles: ``f(row)`` and
    ``f(row, column_x, column_y)``.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) holding both names.
        column_x: Key of the first name; defaults to ``'company_name_norm_x'``.
        column_y: Key of the second name; defaults to ``'company_name_norm_y'``.

    Returns:
        0 when either name is missing, otherwise ``fuzz.ratio`` of the two
        names converted to strings.
    """
    raw_x, raw_y = row[column_x], row[column_y]

    # Test for missing values *before* stringifying: str(None) is 'None', so a
    # plain "== 'nan'" check would let two missing names score a perfect 100.
    if pd.isnull(raw_x) or pd.isnull(raw_y):
        return 0

    company_name_x, company_name_y = str(raw_x), str(raw_y)

    # Kept for backward compatibility: a literal 'nan' string (a previously
    # stringified NaN) still counts as missing.
    if company_name_x == 'nan' or company_name_y == 'nan':
        return 0

    return fuzz.ratio(company_name_x, company_name_y)

def check_category_inclusion(row):
    """Return 1 when ``category_y`` adds something to ``category_x``, else 0.

    Rules (``row`` must provide ``'category_x'`` and ``'category_y'``):
      * ``category_y`` missing                         -> 0
      * ``category_x`` missing, ``category_y`` present -> 1
      * both present -> 0 if ``str(category_x)`` is a substring of
        ``str(category_y)``, otherwise 1

    Missing means None/NaN/NA. The literal string ``'nan'`` is also treated as
    missing, as before.
    """
    raw_x, raw_y = row['category_x'], row['category_y']

    # Detect nulls on the raw values: str(None) is 'None', which a plain
    # "== 'nan'" comparison would treat as a real category.
    x_missing = pd.isnull(raw_x) or str(raw_x) == 'nan'
    y_missing = pd.isnull(raw_y) or str(raw_y) == 'nan'

    if y_missing:
        return 0
    if x_missing:
        return 1
    # NOTE(review): this tests x-in-y, not y-in-x; confirm that is the
    # intended direction.
    return 0 if str(raw_x) in str(raw_y) else 1

In [40]:
# Add, per matched pair, the name-similarity score and the flag telling
# whether the right-hand category is not already covered by the left-hand one.
enrich_fb = enrich_fb.assign(
    name_fuzzy_score=lambda frame: frame.apply(calculate_company_name_fuzzy_score, axis=1),
    category_inclusion_flag=lambda frame: frame.apply(check_category_inclusion, axis=1),
)

In [41]:
# A pair is trusted only when the company names are similar enough.
name_matches = enrich_fb['name_fuzzy_score'] > 50

# Domain enrichment: when the left-hand domain is a social-media one, take the
# right-hand (fb_norm_df) domain and record which identifier supplied it.
replace_domain = (enrich_fb['social_media_flag'] == 'Y') & name_matches
enrich_fb['domain_x'] = np.where(replace_domain, enrich_fb['domain_y'], enrich_fb['domain_x'])
enrich_fb['domain_enriching_id'] = np.where(replace_domain, enrich_fb['identifier_y'], None)

# Category enrichment from the right-hand (fb_norm_df) rows: append category_y
# to an existing category, or use it alone when there is none.
add_category = (
    (enrich_fb['category_inclusion_flag'] > 0)
    & name_matches
    & enrich_fb['category_y'].notnull()
)
# Captured before category_x is overwritten below.
has_category = enrich_fb['category_x'].notnull()

# Rows that are not enriched get None here. This is harmless later because the
# write-back uses DataFrame.update, which skips null values.
enrich_fb['category_x'] = np.where(
    add_category & has_category,
    enrich_fb['category_x'] + '|' + enrich_fb['category_y'],
    np.where(add_category & ~has_category, enrich_fb['category_y'], None),
)

# Record the enriching identifier, appended to any id that is already there.
# Rows that are not enriched get None.
has_enriching_id = enrich_fb['category_enriching_id'].notnull()
enrich_fb['category_enriching_id'] = np.where(
    add_category & ~has_enriching_id,
    enrich_fb['identifier_y'],
    np.where(
        add_category & has_enriching_id,
        enrich_fb['category_enriching_id'] + "|" + enrich_fb['identifier_y'],
        None,
    ),
)


In [61]:
# Drop the _x suffix so the enriched columns line up with full_df's columns.
enrich_fb = enrich_fb.rename(columns={
    'identifier_x': 'identifier',
    'company_name_norm_x': 'company_name_norm',
    'domain_x': 'domain',
    'city_x': 'city',
    'category_x': 'category',
    'address_x': 'address',
})

# Index by identifier (index name 'identifier_index'); the 'identifier' column
# itself is kept.
# NOTE(review): update() below aligns on the index, so this presumably relies
# on full_df being indexed by identifier. Confirm.
enrich_fb = enrich_fb.set_index(enrich_fb['identifier'].rename('identifier_index'))

# Keep only the rows where this step enriched the category or the domain.
mask_changed_rows = (
    enrich_fb['category_enriching_id'].notnull()
    | enrich_fb['domain_enriching_id'].notnull()
)

normalised_columns = ['identifier','company_name_norm','country_code','phone_parsed','domain','city','zip_code','category','address','social_media_flag','city_enriching_id','country_enriching_id','phone_enriching_id','category_enriching_id','other_company_name','domain_enriching_id']

df = enrich_fb.loc[mask_changed_rows, normalised_columns]
# One identifier can match several right-hand rows; keep the first match only.
df = df[~df.index.duplicated(keep='first')]

# Create the column first so that update() has a target to write into.
full_df['domain_enriching_id'] = None

full_df.update(df)


In [62]:
# Coverage snapshot after the Facebook enrichment: number of rows in full_df
# that lack each attribute.
total_records = len(full_df)
null_country = int(full_df['country_code'].isnull().sum())
null_domain = int(full_df['domain'].isnull().sum())
null_city = int(full_df['city'].isnull().sum())
null_zip = int(full_df['zip_code'].isnull().sum())
null_phones = int(full_df['phone_parsed'].isnull().sum())
null_category = int(full_df['category'].isnull().sum())
null_address = int(full_df['address'].isnull().sum())

# Each count is printed with its share of all rows, rounded to 2 decimals.
print(f'Total records                 : {total_records}    [100.00 %]')
print(f'Total records no country      :  {null_country}    [ {round((null_country/total_records)*100,2)} %]')
print(f'Total records no domain       :      {null_domain}    [  {round((null_domain/total_records)*100,2)}  %]')
print(f'Total records no city         :  {null_city}    [ {round((null_city/total_records)*100,2)} %]')
print(f'Total records no zip          :  {null_zip}    [ {round((null_zip/total_records)*100,2)} %]')
print(f'Total records no phone_parsed :  {null_phones}    [  {round((null_phones/total_records)*100,2)} %]')
print(f'Total records no category     :  {null_category}    [ {round((null_category/total_records)*100,2)} %]')
print(f'Total records no address      :  {null_address}    [  {round((null_address/total_records)*100,2)} %]')

Total records                 : 500503    [100.00 %]
Total records no country      :  19374    [ 3.87 %]
Total records no domain       :      1    [  0.0  %]
Total records no city         :  64300    [ 12.85 %]
Total records no zip          :  181168    [ 36.2 %]
Total records no phone_parsed :  61666    [  12.32 %]
Total records no category     :  47019    [ 9.39 %]
Total records no address      :  112583    [  22.49 %]


## Other enrichment steps that could be done, but which I did not implement due to lack of resources:
- Similar to the Facebook enrichment, a Google enrichment should be done.
- Address parsing should be done on the free-text address. This step should have been done prior to the Facebook and Google enrichment; it would potentially increase the number of matches.
- Fuzzy matching on a combination of company_name_norm, country and address should be done in order to further enrich the data and minimise the duplication of the domain data. An example can be found in fuzzymatch.py and merged_G_fb.parquet.
- Also, a deduplication should be done based on the enriching identifiers, to exclude all the index values that are found in the enriching-id columns.

In [64]:
def _join_enriching_ids(ids):
    """Join the non-null enriching ids of one row with '|'."""
    return '|'.join(ids.dropna().astype(str))


# One '|'-separated string per row, holding every enriching id recorded for
# it. Rows with no enriching id get an empty string.
full_df['combined_column'] = full_df[
    ['city_enriching_id', 'country_enriching_id', 'phone_enriching_id',
     'category_enriching_id', 'domain_enriching_id']
].apply(_join_enriching_ids, axis=1)

# Distinct ids across all rows. The empty string from rows without ids is
# included as a value.
unique_enriching_ids = full_df['combined_column'].str.split('|').explode().unique()
distinct_df = pd.DataFrame({'unique_values': unique_enriching_ids})


In [65]:
# Drop the rows of full_df whose index value was used as an enriching id for
# another row.
# NOTE(review): full_df_filtered is not referenced again in the visible cells;
# the aggregation below reads full_df. Confirm which frame is intended.
distinct_df = distinct_df.set_index('unique_values')
full_df_filtered = full_df.loc[~full_df.index.isin(distinct_df.index)]

# Creating final dataset

In [71]:
def create_json_address(df):
    """Serialise the distinct address records of ``df`` into a JSON array string.

    Each row becomes an object with the keys ``id``, ``phone``, ``city``,
    ``zip_code`` and ``raw_address``, read from the columns ``identifier``,
    ``phone_parsed``, ``city``, ``zip_code`` and ``address``. A row whose
    object is identical to one already emitted is skipped; first-seen order
    is kept.

    Missing values (None/NaN/NA) are written as JSON ``null``. Left as-is,
    ``json.dumps`` would emit the bare token ``NaN``, which is not valid JSON
    and is rejected by strict parsers.

    Args:
        df: DataFrame containing the five columns listed above.

    Returns:
        A JSON string holding a list of address objects (``'[]'`` when ``df``
        has no rows).
    """
    def _json_safe(value):
        # Only scalars can be "missing"; anything else is passed through.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return None
        return value

    json_addresses = []
    unique_addresses = set()
    for _, row in df.iterrows():
        # country_code is deliberately not part of the record (it was disabled
        # in the original code as well).
        address = {
            'id': _json_safe(row['identifier']),
            'phone': _json_safe(row['phone_parsed']),
            'city': _json_safe(row['city']),
            'zip_code': _json_safe(row['zip_code']),
            'raw_address': _json_safe(row['address']),
        }
        # The serialised form doubles as the de-duplication key.
        address_str = json.dumps(address)
        if address_str not in unique_addresses:
            json_addresses.append(address)
            unique_addresses.add(address_str)

    return json.dumps(json_addresses)

In [73]:
def _summarise_company_group(group):
    """Collapse the rows of one (name, country, domain) group into one record."""
    return pd.Series({
        'rows_count': len(group),
        'categories': '|'.join(group['category'].dropna().unique()),
        'addresses': create_json_address(group),
    })


# One output row per (company_name_norm, country_code, domain).
# NOTE(review): groupby drops rows with a null in any key column by default,
# so records without a country, name or domain do not reach agg_data.
# NOTE(review): this reads full_df, not full_df_filtered. Confirm the intent.
agg_data = (
    full_df
    .groupby(['company_name_norm', 'country_code', 'domain'])
    .apply(_summarise_company_group)
    .reset_index()
)

In [74]:
# Persist the aggregated per-company table built in the previous cell.
agg_data.to_parquet('../../datasets/output/final_agg_data.parquet')

In [82]:
# Coverage of the aggregated table, counted as empty strings per column.
# NOTE(review): only 'categories' can actually be ''. 'country_code' and
# 'domain' are group keys (null keys are dropped by groupby), and 'addresses'
# is a JSON array string, so those three counts are expected to stay at 0.
total_records = len(agg_data)
null_country = int((agg_data['country_code'] == '').sum())
null_domain = int((agg_data['domain'] == '').sum())
null_category = int((agg_data['categories'] == '').sum())
null_address = int((agg_data['addresses'] == '').sum())

print(f'Total records                 : {total_records}    [100.00 %]')
print(f'Total records no country      :      {null_country}    [  {round((null_country/total_records)*100,2)}  %]')
print(f'Total records no domain       :      {null_domain}    [  {round((null_domain/total_records)*100,2)}  %]')
print(f'Total records no category     :  {null_category}    [  {round((null_category/total_records)*100,2)} %]')
print(f'Total records no address      :      {null_address}    [  {round((null_address/total_records)*100,2)}  %]')

Total records                 : 419349    [100.00 %]
Total records no country      :      0    [  0.0  %]
Total records no domain       :      0    [  0.0  %]
Total records no category     :  35565    [  8.48 %]
Total records no address      :      0    [  0.0  %]
