In [26]:
# This script identifies pairs of potential duplicates 
# And filters on business name and business status

# We use pandas deduplicate functions to remove exact matches

import json
import pandas as pd
# import re
from rl_helper import strip_accents, AddressClean, haversine
import recordlinkage as rl
from recordlinkage.preprocessing import clean
# from recordlinkage.index import Block
import numpy as np
import math

import mitosheet

In [27]:
# Define functions

def isnan(value):
    try:
        import math
        return math.isnan(float(value))
    except:
        return False

In [28]:
# IMPORT DATA

df_input = pd.read_csv("~/ODBiz/6-AssignCSDs/NAICS_Final.csv", low_memory=False, dtype='str')
print('input length: ', len(df_input))

input length:  558964


In [29]:
# FILTERING

df = df_input

# Remove any with no business name
print('Number removed with business name = NA: ', len(df[df.business_name.isnull()]))
df = df[~df.business_name.isnull()]

# Remove any with inactive status
no_removed = len(df[df.status.isin(['Gone Out of Business', 'Inactive', 'Cancelled'])])
print('Number removed with inactive status: ', no_removed)
df = df[~df.status.isin(['Gone Out of Business', 'Inactive', 'Cancelled'])]

Number removed with business name = NA:  29111
Number removed with inactive status:  58598


In [30]:
# FORMATTTING

# Remove accents
text_cols=['business_name','formatted_en']
for col in text_cols:
    df.loc[~df[col].isnull(),col]=df.loc[~df[col].isnull(),col].apply(strip_accents)

# Remove periods, apostrophes, commas, and hypens in the Name and address columns
r_list=[r".",r",",r"'",r"-"]

for r in r_list:
    df["business_name_2"] = df["business_name"].str.replace(r,' ',regex=False)
    df['formatted_en'] = df['formatted_en'].str.replace(r,' ',regex=False)

# Remove excess whitespace
df["business_name_2"] = df["business_name"].str.replace(r" +"," ",regex=True)
df["formatted_en"] = df["formatted_en"].str.replace(r" +"," ",regex=True)
# df['full_address'] = df['full_address'].str.replace(r" +"," ",regex=True)

# Standardise postal codes - just remove empty space and make sure it's all lower case
# df.loc[~df.postal_code.isnull(),'postal_code'] = df.loc[~df.postal_code.isnull(),'postal_code'].str.replace(' ','').str.lower()

# Some records have street number and street name, but no address field filled
# df.loc[(df.full_address.isnull())&\
#        (~df.formatted_en.isnull()),'full_address']\
#     = clean(df.loc[(df.full_address.isnull())&\
#        (~df.formatted_en.isnull()),'street_no']+' '+\
#            df.loc[(df.full_address.isnull())&\
#        (~df.formatted_en.isnull()),'formatted_en']+' '+\
#         df.loc[(df.full_address.isnull())&\
#        (~df.formatted_en.isnull()),'city'])


In [31]:
# LABEL DUPLICATES 

# Deduplicate and filter strictly on name, street number and street name
df['dupe_1'] = df.duplicated(subset=['business_name_2', 'street_no', 'formatted_en'], keep=False)

# Separate potential duplicates
df_non_dup = df[~df['dupe_1'] == True]
df =  df[df['dupe_1'] == True]

# Sort by street number, then street name, then business name
df = df.sort_values(['business_name_2', 'street_no', 'formatted_en'])


In [32]:
# 3. loop and compare duplicates between rows

printcounter = 0
col_list = ['business_name_2','licence_number', 'postal_code', 'formatted_en', 'business_sector', 'business_description', 'licence_type', 'primary_NAICS']

for i in range(len(df)):
    
    if (printcounter == 10000):
        print('Done: ', i, ' of ', len(df))
        printcounter = 0
    printcounter += 1
    
    # check if the street number is the same as the previous row
    if (i > 1):
        if (df.at[df.index[i],'business_name_2'] == df.at[df.index[i-1],'business_name_2']):
            # then street name
            if (df.at[df.index[i],'formatted_en'] == df.at[df.index[i-1],'formatted_en']):
                # then name
                if (df.at[df.index[i],'street_no'] == df.at[df.index[i-1],'street_no']):

                    # make a small dataframe with just two rows
                    df2 = df.iloc[[i-1, i]]
                    df2 = df2[col_list]

                    # remove columns which have NA for either value
                    df2 = df2.dropna(axis=1)
                    
                    # if non-NA rows are identical, then mark as duplicate
                    if ((df2.iloc[0] == df2.iloc[1]).all()):
                        # duplicate
                        df.at[df.index[i], 'dupe'] = True
                        df.at[df.index[i-1], 'dupe'] = True
                        
                        # copy values over if one column is null     
                        for col in col_list:
                            var_1 = df.at[df.index[i-1], col]
                            var_2 = df.at[df.index[i], col]

                            # if first one is NA and the other isn't
                            if (isnan(var_1) & (not isnan(var_2))):
                                df.at[df.index[i-1], col] = var_2
                        
                        # label the one we're going to keep/ remove
                        df.at[df.index[i-1], 'keep'] = True
                        df.at[df.index[i], 'keep'] = False

                        # add filename to note column of the other

                    # otherwise if non-NA rows don't match, not duplicate
                    else:    
                        df.at[df.index[i], 'dupe'] = False
                        

Done:  10000  of  111438
Done:  20000  of  111438
Done:  30000  of  111438
Done:  40000  of  111438
Done:  50000  of  111438
Done:  60000  of  111438
Done:  70000  of  111438
Done:  80000  of  111438
Done:  90000  of  111438
Done:  100000  of  111438
Done:  110000  of  111438


In [None]:
# POST PROCESSING

# output list of duplicates to be manually checked inspected
df_dup = df[df['dupe'] == True]
df_dup.to_csv('outputs/dupes.csv', encoding='utf-8')

# Remove duplicates
print('Number of duplicates removed: ', len(df[df['dupe'] == True]))
df = df.drop(df[df['keep'] == False].index)

# Merge back in non-duplicates
df_dedup = pd.concat([df, df_non_dup], axis=0)

# Remove any new columns created: 
df_dedup = df_dedup.drop(columns=['business_name_2', 'dupe_1', 'dupe', 'keep'])

print('Number of rows remaining: ', len(df_dedup))
print('Total rows removed from filtering and deduplication: ', len(df_input) - len(df_dedup))
df_dedup.to_csv('outputs/deduplicated.csv', encoding='utf-8')