In [3]:
import pandas as pd
import recordlinkage

import mitosheet

# This script will deduplicate our data using the record linkage toolkit

In [None]:
'''
Steps:
    (1) Formatting
            Read in input file and drop all entries where Street_Number or Street_Name is empty
            Restrict to single Province or Territory, and read in OpenAddress data for that province
            Make everything a string
            Clean address columns (remove excess white space, punctuation, etc)
    (2) Identify duplicates
            
    (3) Scrap duplicates

'''


'''
For now, let's do the following:
https://towardsdatascience.com/performing-deduplication-with-record-linkage-and-supervised-learning-b01a66cc6882
Then adjust based on past scripts, which didn't look like we could just copy and paste

- Filter the dataset to just 8 or so columns
- Test for just one province (we'll need to chunk later on)


'''

In [9]:
# Input data
df = pd.read_csv('~/ODBiz/6-AssignCSDs/NAICS_Final.csv', low_memory=False)

In [4]:
df_input = df

In [9]:
# Cast every column to string and lower-case all values.
# NOTE: astype(str) turns missing values into the literal string 'nan', so
# isna()/notna()/dropna() will no longer detect them after this cell; compare
# against the string 'nan' instead.
df = df.astype(str).apply(lambda x: x.str.lower())

In [6]:
mitosheet.sheet(df, analysis_to_replay="id-ledhmxfmtp")

MitoWidget(analysis_data_json='{"analysisName": "id-ledhmxfmtp", "analysisToReplay": null, "code": [], "stepSu…

In [10]:
list(df.columns)

['index',
 'idx',
 'localfile',
 'business_name',
 'alt_business_name',
 'business_sector',
 'business_subsector',
 'business_description',
 'business_id_no',
 'licence_number',
 'licence_type',
 'primary_NAICS',
 'secondary_NAICS',
 'NAICS_descr',
 'NAICS_descr2',
 'alt_econ_act_code',
 'alt_econ_act_descrip',
 'latitude',
 'longitude',
 'full_address',
 'full_address_2',
 'mailing_address',
 'postal_code',
 'unit',
 'street_no',
 'street_name',
 'street_direction',
 'street_type',
 'city',
 'province',
 'country',
 'business_website',
 'email',
 'telephone',
 'telephone_extension',
 'toll_free_telephone',
 'fax',
 'total_no_employees',
 'no_full_time',
 'no_part_time',
 'no_seasonal',
 'date_established',
 'indigenous',
 'status',
 'provider',
 'geo_source',
 'formatted_en',
 'formatted_fr',
 'csdname_oda',
 'keep_match',
 'no_match_reason',
 'keep_gc',
 'geometry',
 'index_right',
 'CSDUID',
 'CSDNAME',
 'PRUID',
 'NAICS_Group',
 'NAICS']

In [None]:
# TODO: incomplete statement - the value to compare 'province' against is missing,
# so this cell is a SyntaxError as written. Fill in a province value (e.g. 'bc',
# as used in later cells) before running.
df = df[df['province'] == ]

In [5]:
df['city'].value_counts()

TORONTO, ON          119239
Vancouver            109613
Montréal (Québec)     45806
Edmonton              38693
Calgary               34546
                      ...  
CP h2g2h3                 1
CP h1p 1x7                1
CP h1m3v6                 1
CP h1e1p1                 1
Torreon Coahuila          1
Name: city, Length: 11124, dtype: int64

In [None]:
# TODO: incomplete assignment - there is no right-hand side, so this cell is a
# SyntaxError as written.
df_nan = 

In [6]:
df_nb = df[df['province'] == 'bc']

In [7]:
df_dup = df[df.duplicated(subset=['business_name', 'province', 'city', 'licence_number'], keep=False)]

In [8]:
len(df_dup)

78313

In [57]:
mitosheet.sheet(df_dup, analysis_to_replay="id-oskniyzboa")

MitoWidget(analysis_data_json='{"analysisName": "id-cerbgqscuh", "analysisToReplay": {"analysisName": "id-oskn…

In [None]:
from mitosheet import *; register_analysis("id-oskniyzboa");
    
# Sorted business_name in ascending order
df_dup = df_dup.sort_values(by='business_name', ascending=True, na_position='first')


In [6]:
# Detect exact match duplicates (for testing)
df_dup = df[df.duplicated(subset=['business_name', 'province', 'city', 'licence_number'], keep='first')]

In [38]:
# find exact matches for license number
df_dup = df[(df.duplicated(subset=['licence_number'], keep=False))]

In [39]:
df_dup = df_dup.sort_values(by='licence_number', ascending=True, na_position='first')

In [24]:
# Drop rows whose licence_number is missing.
# dropna(inplace=True) returns None, so assigning its result overwrote df_dup
# with None; use the returned (non-inplace) frame instead.
df_dup = df_dup.dropna(subset=['licence_number'])

In [21]:
df_dup = df_dup[df_dup['licence_number'].notna()]


In [41]:
df_dup = df_dup[df_dup['licence_number'] != 'nan']

In [42]:
len(df_dup)

14075

In [31]:
df_dup['licence_number'].isna().sum()

0

In [9]:

# Create an indexer
indexer = recordlinkage.Index()

# Block jointly on the columns that must match exactly. Separate indexer.block(...)
# calls are combined as a union of candidate pairs, so blocking on 'province' alone
# would pair every record with every other record in the same province. A pair can
# only pass the filter below if these three columns are equal, so joint blocking
# yields the same duplicates from far fewer candidate pairs.
indexer.block(['province', 'city', 'licence_number'])

# Create a candidate index (pairs of row labels within df)
candidate_index = indexer.index(df)

# Create a comparison object
compare_cl = recordlinkage.Compare()

# Compare 'business_name' fuzzily and 'province', 'city', 'licence_number' exactly
compare_cl.string('business_name', 'business_name', method='jarowinkler', threshold=0.85)
compare_cl.exact('province', 'province')
compare_cl.exact('city', 'city')
compare_cl.exact('licence_number', 'licence_number')

# Get a comparison vector (one row per candidate pair, one column per comparison)
compare_vectors = compare_cl.compute(candidate_index, df)

# Keep the pairs where all four comparisons matched (row sum of 4, i.e. > 3)
duplicate_indexes = compare_vectors[compare_vectors.sum(axis=1) > 3].index

# duplicate_indexes is a two-level MultiIndex of (row, row) pairs, which cannot be
# used to index df's flat index directly; collect the row labels from both levels.
duplicate_row_labels = duplicate_indexes.get_level_values(0).union(
    duplicate_indexes.get_level_values(1)
)

# Subset the dataframe to the duplicate rows
df_duplicates = df.loc[duplicate_row_labels]


In [63]:
mitosheet.sheet(df_dup[df_dup['province'] == 'bc'], analysis_to_replay="id-nurymagnjx")

MitoWidget(analysis_data_json='{"analysisName": "id-xapoleuwwv", "analysisToReplay": {"analysisName": "id-nury…

In [None]:
from mitosheet import *; register_analysis("id-nurymagnjx");
    
# Assigning a sorted frame back through a boolean mask re-aligns it on the index,
# so the original row order was kept and the sort was discarded. Sort a separate
# 'bc' subset instead; df_dup itself is left untouched.
df_bc_sorted = df_dup[df_dup['province'] == 'bc']

# Sorted localfile in ascending order
df_bc_sorted = df_bc_sorted.sort_values(by='localfile', ascending=True, na_position='first')

# Sorted licence_number in ascending order
df_bc_sorted = df_bc_sorted.sort_values(by='licence_number', ascending=True, na_position='first')


In [61]:
df_bc = df_dup[df_dup['province'] == 'bc']
df_bc['localfile'].value_counts()

bc_burnaby_business_licences_2.csv                          1927
bc_new_westminster_business_licences_(residents).csv        1007
bc_nanaimo_business_licences.csv                             982
bc_victoria_business_licences.csv                            892
bc_vancouver_business_licences.csv                           743
bc_squamish_business_licence_annual_2021.csv                 675
bc_prince_george_business_licence.csv                        377
bc_liquor_licences.csv                                       211
bc_new_westminster_business_licences_(nonresidents).csv      165
bc_burnaby_business_licences.csv                              81
bc_new_westminster_business_licences_(new_this_year).csv       6
Name: localfile, dtype: int64

In [11]:
# NOTE(review): this cell looks like an unadapted record-linkage template. It
# references data1/data2 and the columns 'name'/'address', none of which appear
# in the other cells shown in this notebook - confirm before running.
import recordlinkage

# Create an indexer
indexer = recordlinkage.Index()

# Block on business_name and city (each block() call adds its own set of candidate pairs)
indexer.block('business_name')
indexer.block('city')

# Create the pairs
# NOTE(review): data1 and data2 are not defined in any visible cell - TODO confirm source
pairs = indexer.index(data1, data2)

# Create the comparison object
compare_cl = recordlinkage.Compare()

# Compare the name
# NOTE(review): the column list printed earlier has 'business_name', not 'name' - TODO confirm
compare_cl.string('name', 'name', method='jarowinkler', threshold=0.85)

# Compare the address
# NOTE(review): the column list printed earlier has 'full_address', not 'address' - TODO confirm
compare_cl.string('address', 'address', method='jarowinkler', threshold=0.85)

# Compute the comparison results for every candidate pair; no filter is applied
# here to select which pairs count as matches
matches = compare_cl.compute(pairs, data1, data2)


NameError: name 'pandas' is not defined