In [81]:
import pandas as pd
import numpy as np
import regex
from fuzzywuzzy import fuzz

In [82]:
full_df=pd.read_parquet('../../datasets/working/full_df.parquet')

fb_norm_df=pd.read_parquet('../../datasets/working/fb_norm.parquet')
g_norm_df=pd.read_parquet('../../datasets/working/g_norm.parquet')
w_norm_df=pd.read_parquet('../../datasets/working/w_norm.parquet')

full_df.dtypes

identifier           object
company_name_norm    object
country_code         object
phone_parsed         object
domain               object
city                 object
zip_code             object
category             object
address              object
dtype: object

In [83]:
google_domains = g_norm_df['domain'].value_counts()
facebook_domains = fb_norm_df['domain'].value_counts()
websites_domains = w_norm_df['domain'].value_counts()

In [84]:
social_media_patterns = [
    r"(?i).*facebook\.com$",   # Facebook
    r"(?i).*twitter\.com$",    # Twitter
    r"(?i).*instagram\.com$",  # Instagram
    ]

def is_social_media_domain(domain):
    if pd.isnull(domain):
        return np.nan
    elif isinstance(domain, str):
        for pattern in social_media_patterns:
            if regex.match(pattern, domain):
                return 'Y'
        return 'N'
    else:
        return domain
    
def calculate_company_name_fuzzy_score(row):
    company_name_x = str(row['company_name_norm_x'])  # Convert to string
    company_name_y = str(row['company_name_norm_y'])  # Convert to string
    
    # Handle NaN values
    if company_name_x == 'nan' or company_name_y == 'nan':
        return 0
    
    return fuzz.ratio(company_name_x, company_name_y)

def check_category_inclusion(row):
    result = np.nan
    category_x = str(row['category_x']) # Convert to string
    category_y = str(row['category_y']) # Convert to string
    if (category_x == 'nan' and category_y == 'nan') or (category_x != 'nan' and category_y == 'nan'):
        result = 0
    else:
        if category_x == 'nan' and category_y != 'nan':
            result = 1
        else:
            if category_y in category_x:
                result = 0
            else:
                result = 1
    return result

In [85]:
full_df['social_media_flag'] = full_df['domain'].apply(is_social_media_domain)
mask_no_social_media = (full_df['social_media_flag'] == 'N')

g_soc_med = full_df[~mask_no_social_media].copy()

full_df_no_soc = full_df[mask_no_social_media].copy()


In [86]:
mask_websites = full_df['identifier'].str.contains('website')
mask_google = full_df['identifier'].str.contains('google')
mask_facebook = full_df['identifier'].str.contains('facebook')

#Getting full_df wihtout websites to enrich it furhter with data in websites

no_w = full_df[~mask_websites].merge(w_norm_df, how='left', on=['domain']) #merging on domain as there is a lot of consistency and low missing values
no_w.reset_index(drop=True, inplace=True)

no_w['name_fuzzy_score'] = no_w.apply(calculate_company_name_fuzzy_score, axis=1)
no_w['category_inclusion_flag'] = no_w.apply(check_category_inclusion,axis=1)

In [87]:
total_records = len(no_w)
null_country  = len(no_w[no_w['country_code_x'].isnull()])
null_domain  = len(no_w[no_w['domain'].isnull()])
null_city = len(no_w[no_w['city_x'].isnull()])
null_zip = len(no_w[no_w['zip_code_x'].isnull()])
null_phones = len(no_w[no_w['phone_parsed_x'].isnull()])
null_category = len(no_w[no_w['category_x'].isnull()])
null_address = len(no_w[no_w['address_x'].isnull()])

print(f'Total records                 : {total_records}    [100.00 %]')
print(f'Total records no country      :  {null_country}    [ {round((null_country/total_records)*100,2)} %]')
print(f'Total records no domain       :      {null_domain}    [  {round((null_domain/total_records)*100,2)}  %]')
print(f'Total records no city         :  {null_city}    [ {round((null_city/total_records)*100,2)} %]')
print(f'Total records no zip          : {null_zip}    [ {round((null_zip/total_records)*100,2)} %]')
print(f'Total records no phone_parsed :  {null_phones}    [ {round((null_phones/total_records)*100,2)} %]')
print(f'Total records no category     :  {null_category}    [ {round((null_category/total_records)*100,2)} %]')
print(f'Total records no address      :  {null_address}    [ {round((null_address/total_records)*100,2)} %]')

Total records                 : 428484    [100.00 %]
Total records no country      :  66543    [ 15.53 %]
Total records no domain       :      0    [  0.0  %]
Total records no city         :  72387    [ 16.89 %]
Total records no zip          : 119046    [ 27.78 %]
Total records no phone_parsed :  59804    [ 13.96 %]
Total records no category     :  68330    [ 15.95 %]
Total records no address      :  40566    [ 9.47 %]


In [88]:
#City Enrichment from Websites
no_w['city_enriched'] = np.where(
    (no_w['city_x'].isnull() & no_w['city_y'].notnull())
    & (
      (no_w['company_name_norm_x'] == no_w['company_name_norm_y'])
    | ((no_w['name_fuzzy_score']>50) & (no_w['city_y'].notnull()))
    | ((no_w['phone_parsed_x'] == no_w['phone_parsed_y']) & no_w['phone_parsed_x'].notnull())
    | ((no_w['company_name_norm_x'] == no_w['company_name_norm_y'] )& (no_w['country_code_x'] == no_w['country_code_y']) & no_w['country_code_x'].notnull())
    ),
    no_w['city_y'],
    None
)

no_w['city_enriching_id'] = np.where(
     (no_w['city_x'].isnull() & no_w['city_y'].notnull())
    & (
      (no_w['company_name_norm_x'] == no_w['company_name_norm_y'])
    | ((no_w['name_fuzzy_score']>50) & (no_w['city_y'].notnull()))
    | ((no_w['phone_parsed_x'] == no_w['phone_parsed_y']) & no_w['phone_parsed_x'].notnull())
    | ((no_w['company_name_norm_x'] == no_w['company_name_norm_y'] )& (no_w['country_code_x'] == no_w['country_code_y']) & no_w['country_code_x'].notnull())
    ),
    no_w['identifier_y'],
    None
)

In [89]:
total_enriched = len(no_w[no_w['city_enriching_id'].notnull()])
print(f'Total enriched cities : {total_enriched} [ {round((total_enriched/null_city)*100,2)} %] [ {round((total_enriched/total_records)*100,2)} %]')

Total enriched cities : 19149 [ 26.45 %] [ 4.47 %]


In [90]:
#Country Enrichment from Websites
no_w['country_enriched'] = np.where( 
      ((no_w['country_code_x'].isnull())) 
    &(
      ((no_w['name_fuzzy_score']>50) & (no_w['country_code_y'].notnull()))
    | ((no_w['phone_parsed_x'] == no_w['phone_parsed_y']) & no_w['phone_parsed_x'].notnull())
    ),
    no_w['country_code_y'],
    None
)

no_w['country_enriching_id'] = np.where(
     ((no_w['country_code_x'].isnull()))
    &(
      ((no_w['name_fuzzy_score']>50) & (no_w['country_code_y'].notnull()))
    | ((no_w['phone_parsed_x'] == no_w['phone_parsed_y']) & no_w['phone_parsed_x'].notnull())
    ),
    no_w['identifier_y'],
    None
)

In [91]:
total_enriched_country = len(no_w[no_w['country_enriching_id'].notnull()])
print(f'Total enriched countries : {total_enriched} [ {round((total_enriched_country/null_country)*100,2)} %] [ {round((total_enriched_country/total_records)*100,2)} %]')

Total enriched countries : 19149 [ 18.79 %] [ 2.92 %]


In [92]:
#Phone Enrichment from Websites
no_w['phone_enriched'] = np.where( 
      ((no_w['phone_parsed_x'].isnull()) | (no_w['phone_parsed_x']=='nan') ) 
    &(
      ((no_w['name_fuzzy_score']>50) & (no_w['phone_parsed_y'].notnull()))
    ),
    no_w['phone_parsed_y'],
    None
)

no_w['phone_enriching_id'] = np.where(
     ((no_w['phone_parsed_x'].isnull()) | (no_w['phone_parsed_x']=='nan') ) 
    &(
      ((no_w['name_fuzzy_score']>50) & (no_w['phone_parsed_y'].notnull()))
    ),
    no_w['identifier_y'],
    None
)

In [93]:
total_enriched_phones = len(no_w[no_w['phone_enriching_id'].notnull()])
print(f'Total enriched phones : {total_enriched} [ {round((total_enriched_phones/null_phones)*100,2)} %] [ {round((total_enriched_phones/total_records)*100,2)} %]')

Total enriched phones : 19149 [ 16.6 %] [ 2.32 %]


In [94]:
#Category Enrichment from Websites
no_w['category_enriched'] = np.where( 
      ((no_w['category_inclusion_flag']>0) & (no_w['category_x'].notnull())) 
    &(
      ((no_w['name_fuzzy_score']>50) & (no_w['category_y'].notnull()))
    | ((no_w['phone_parsed_x'] == no_w['phone_parsed_y']) & no_w['phone_parsed_x'].notnull())
    ),
    no_w['category_x'] + '|' + no_w['category_y'],
    np.where(
        ((no_w['category_inclusion_flag']>0) & (no_w['category_x'].isnull())) 
    &(
      ((no_w['name_fuzzy_score']>50) & (no_w['category_y'].notnull()))
    | ((no_w['phone_parsed_x'] == no_w['phone_parsed_y']) & no_w['phone_parsed_x'].notnull())
    ),
    no_w['category_y'],
    None
    )
)

no_w['category_enriching_id'] = np.where(
     ((no_w['category_inclusion_flag']>0)) 
    &(
      ((no_w['name_fuzzy_score']>50) & (no_w['category_y'].notnull()))
    | ((no_w['phone_parsed_x'] == no_w['phone_parsed_y']) & no_w['phone_parsed_x'].notnull())
    ),
    no_w['identifier_y'],
    None
)

In [95]:
total_enriched_categories = len(no_w[no_w['category_enriching_id'].notnull()])
print(f'Total enriched categories : {total_enriched} [ {round((total_enriched_categories/null_category)*100,2)} %] [ {round((total_enriched_categories/total_records)*100,2)} %]')

Total enriched categories : 19149 [ 134.55 %] [ 21.46 %]


In [96]:
no_w.to_parquet('../../datasets/working/no_web.parquet')

In [97]:
df = pd.read_parquet('../../datasets/working/no_web.parquet')

In [98]:
mask_country = df['country_enriching_id'].notnull() & df['country_code_x'].isnull()
mask_phone = df['phone_enriching_id'].notnull() & df['phone_parsed_x'].isnull()
mask_city = df['city_enriching_id'].notnull() & df['city_x'].isnull()
mask_category = df['category_enriching_id'].notnull() & df['category_x'].isnull()

In [99]:

df.loc[mask_country, 'country_code_x'] = df.loc[mask_country, 'country_enriched'] #14350
df.loc[mask_phone, 'phone_parsed_x'] = df.loc[mask_phone, 'phone_enriched'] #14350
df.loc[mask_city, 'city_x'] = df.loc[mask_city, 'city_enriched'] #14350
df.loc[mask_category, 'category_x'] = df.loc[mask_category, 'category_enriched'] #14350

In [100]:
total_records = len(df)
null_country  = len(df[df['country_code_x'].isnull()])
null_domain  = len(df[df['domain'].isnull()])
null_city = len(df[df['city_x'].isnull()])
null_zip = len(df[df['zip_code_x'].isnull()])
null_phones = len(df[df['phone_parsed_x'].isnull()])
null_category = len(df[df['category_x'].isnull()])
null_address = len(df[df['address_x'].isnull()])

print(f'Total records                 : {total_records}    [100.00 %]')
print(f'Total records no country      :  {null_country}    [ {round((null_country/total_records)*100,2)} %]')
print(f'Total records no domain       :      {null_domain}    [  {round((null_domain/total_records)*100,2)}  %]')
print(f'Total records no city         :  {null_city}    [ {round((null_city/total_records)*100,2)} %]')
print(f'Total records no zip          : {null_zip}    [ {round((null_zip/total_records)*100,2)} %]')
print(f'Total records no phone_parsed :  {null_phones}    [ {round((null_phones/total_records)*100,2)} %]')
print(f'Total records no category     :  {null_category}    [ {round((null_category/total_records)*100,2)}  %]')
print(f'Total records no address      :  {null_address}    [  {round((null_address/total_records)*100,2)} %]')

Total records                 : 428484    [100.00 %]
Total records no country      :  54182    [ 12.65 %]
Total records no domain       :      0    [  0.0  %]
Total records no city         :  53238    [ 12.42 %]
Total records no zip          : 119046    [ 27.78 %]
Total records no phone_parsed :  49877    [ 11.64 %]
Total records no category     :  45838    [ 10.7  %]
Total records no address      :  40566    [  9.47 %]


In [101]:
#'city_enriching_id','country_enriching_id','phone_enriching_id','category_enriching_id'
mask_changed_rows = df['city_enriching_id'].notnull() | df['country_enriching_id'].notnull() | df['phone_enriching_id'].notnull() | df['category_enriching_id'].notnull()
normalised_columns = ['identifier','company_name_norm','other_company_name','country_code','phone_parsed','domain','city','zip_code','category','address','social_media_flag','city_enriching_id','country_enriching_id','phone_enriching_id','category_enriching_id']
df.rename(columns={
    'identifier_x':'identifier',
    'company_name_norm_x': 'company_name_norm',
    'country_code_x': 'country_code',
    'phone_parsed_x': 'phone_parsed',
    'city_x':'city',
    'zip_code_x':'zip_code',
    'category_x':'category',
    'address_x':'address',
    'company_name_norm_y':'other_company_name',
},inplace=True)
no_w_enriched = df[mask_changed_rows][normalised_columns].copy()

In [102]:
no_w_enriched.to_parquet('../../datasets/intermediar/1.1/no_w_enriched.parquet')

## Bring the updated data back to full_df

In [103]:
full_df['city_enriching_id'] = None
full_df['country_enriching_id'] = None
full_df['phone_enriching_id'] = None
full_df['category_enriching_id'] = None
full_df['other_company_name'] = None

full_df['identifier_index'] = full_df['identifier']
full_df.set_index('identifier_index', inplace=True)

no_w_enriched['identifier_index'] = no_w_enriched['identifier']
no_w_enriched.set_index('identifier_index', inplace=True)

In [104]:
full_df.update(no_w_enriched)

In [105]:
full_df.to_parquet('../../datasets/output/full_df_web_enriched.parquet')