# Imports 

In [93]:
import pandas as pd
import re 
import numpy as np
import datetime
from datetime import datetime

## a couple recordlinkage packages
import fuzzywuzzy
import recordlinkage

## repeated printouts: show the value of every bare expression in a cell,
## not only the last one (several cells below display more than one table)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
## modules still being installed on jhub
## nltk for string distance
import nltk

## jarowinkler
from pyjarowinkler import distance

# Load and view dataset 1: tax certificates for San Diego businesses

In [270]:
## dataset landing page: https://data.sandiego.gov/datasets/business-listings/

## read the active tax certificates directly from the city's data portal
## and preview the first rows
sd_active_biz_url = "https://seshat.datasd.org/ttcs/sd_businesses_active_datasd.csv"
sd_active_biz = pd.read_csv(sd_active_biz_url)
sd_active_biz.head()

Unnamed: 0,account_key,account_status,account_status_code,date_account_creation,date_cert_expiration,date_cert_effective,business_owner_name,ownership_type,date_business_start,dba_name,...,address_city,address_state,address_zip,address_suite,address_pmb_box,address_po_box,bid,council_district,lat,lng
0,1974000024,Active,A,1974-07-01 12:00:00,2021-06-30 12:00:00,2020-07-01 12:00:00,PARRON HALL CORP,CORP,1974-07-01 12:00:00,PARRON HALL,...,SAN DIEGO,CA,92123-2697,100.0,,,,7.0,32.806283,-117.120553
1,1974000035,Active,A,1974-07-01 12:00:00,2021-06-30 12:00:00,2020-07-01 12:00:00,UNIV MECHANICAL & ENGINEERING CONTRACTORS INC,CORP,1974-07-01 12:00:00,UNIVERSITY MECHANICAL & ENGINEERING CONTRACTORS,...,EL CAJON,CA,92020-1812,,,,,,32.808827,-116.977019
2,1974000039,Active,A,1974-07-01 12:00:00,2021-06-30 00:00:00,2020-07-01 00:00:00,ADMIRAL EXCHANGE CO INC,CORP,1974-07-01 12:00:00,ADMIRAL EXCHANGE CO INC,...,SAN DIEGO,CA,92121-2613,,,,,6.0,32.881855,-117.167821
3,1974000053,Active,A,1974-07-01 12:00:00,2021-06-30 00:00:00,2020-07-01 00:00:00,R W SMITH & CO INC,CORP,1974-07-01 12:00:00,R W SMITH & COMPANY,...,SAN DIEGO,CA,92131-1650,,,,,5.0,32.898905,-117.109826
4,1974000110,Active,A,1974-07-01 12:00:00,2021-06-30 12:00:00,2020-07-01 12:00:00,KAMAN INDUSTRIAL TECHNOLOGIES CORPORATION,CORP,1974-07-01 12:00:00,KAMAN INDUSTRIAL TECHNOLOGIES CORP,...,BLOOMFIELD,CT,06002-5321,3105.0,,,,,41.858038,-72.703435


## Load and view dataset 2: PPP loans > 150k

General link: https://data.sba.gov/dataset/ppp-foia/resource/3d28c417-5170-4f1f-be31-b0c7b0182501 
        

For a real application, we'd want to programmatically load and rowbind the different < 150k sheets. For this exercise,
we'll just look at the larger loans (>150k) and subset to california

In [271]:
## national file of PPP loans of $150k and above, read straight from the SBA site
ppp_150k_plus_url = "https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/6b62a44b-69ec-436a-9b95-0ea550475543/download/public_150k_plus.csv"
ppp_raw = pd.read_csv(ppp_150k_plus_url)

In [21]:

## check how complete the state field is before filtering on it
## (only about 14 rows have a missing state)
ppp_raw.BorrowerState.value_counts(dropna = False)

## keep loans where (1) BorrowerState == CA and (2) the borrower zip appears
## verbatim among the zips in the SD tax certificate data
## NOTE: this compares the raw zip strings; if doing this for real, we'd first
## standardize both sides to the 5-digit zip (or both to ZIP+4)
is_ca_borrower = ppp_raw.BorrowerState == "CA"
has_sd_zip = ppp_raw.BorrowerZip.isin(sd_active_biz.address_zip)
ppp_keep = ppp_raw[is_ca_borrower & has_sd_zip].copy()
ppp_keep.shape


CA     127150
TX      74279
NY      71696
FL      58508
IL      40116
PA      37675
OH      31980
NJ      31620
MI      29242
MA      26863
GA      25170
WA      23197
VA      22797
NC      22292
CO      19386
MN      19336
MD      18562
WI      17119
IN      16323
MO      16003
TN      15548
AZ      15400
LA      13375
OR      13195
CT      12613
AL      10672
SC      10496
OK       9833
UT       9102
KY       9001
IA       8144
NV       8039
KS       8036
AR       5747
NE       5695
MS       5347
NH       4982
HI       4894
NM       4392
ID       4324
DC       4241
ME       4129
RI       3720
WV       3285
ND       3162
MT       3093
DE       2921
PR       2817
SD       2647
AK       2599
VT       2234
WY       2114
GU        425
VI        272
MP         80
AS         18
NaN        14
Name: BorrowerState, dtype: int64

(17104, 51)

In [272]:
## ppp_keep is an independent copy, so the raw national file can be
## dropped to save memory
del ppp_raw

## Step 1 - what are the possible join fields between the two:

San Diego tax certificate:

- Business-level fields:
    - Owner name
    - Business name (dba_name)
    
- Sector-level fields:
    - naics_sector 
    - naics_code
    - naics_description
    
- Geographic fields:
    - City and state 
    - Zip 
    - Bid (business improvement district)
    - Council district
    - Address
    
PPP loans:

- Business-level:
    - BorrowerName
    - Borrower Address
    - BorrowerCity
    - BorrowerState
    - BorrowerZip


## Step 2-- build our matching approach using some manual examples

Examples of two PPP loan recipients:

- THE KLEINFELDER GROUP, INC.
- DURAN FREIGHT CORPORATION

In [273]:
## SD business names containing KLEINFELDER followed by whitespace
klein_patt = r".*(\s+)?KLEINFELDER\s+.*"
klein_regex = re.compile(klein_patt)
klein_possible = list(filter(klein_regex.match, sd_active_biz.dba_name))
klein_possible

['KLEINFELDER CONSTRUCTION SERVICES',
 'CH2M HILL KLEINFELDER & A JOINT VENTURE',
 'KLEINFELDER INC']

In [274]:
## SD business names containing DURAN followed by whitespace
duran_patt = r".*(\s+)?DURAN\s+.*"
duran_regex = re.compile(duran_patt)
duran_possible = list(filter(duran_regex.match, sd_active_biz.dba_name))
duran_possible

['DURAN MAINTENANCE AND LANDSCAPE MOBILE UNIT',
 'DURAN FREIGHT CORPORATION',
 'OFELIA DURAN GALAVIZ',
 'GERMAN DURAN LANDSCAPE',
 'LETICIA S DURAN MENDOZA',
 'ALEX DURAN FITNESS']

### Investigate fields that could help weed out false matches for the first business

In [275]:
## list the columns available in each dataset so we can pick fields
## that help adjudicate between candidate matches
sd_active_biz.columns
ppp_keep.columns

Index(['account_key', 'account_status', 'account_status_code',
       'date_account_creation', 'date_cert_expiration', 'date_cert_effective',
       'business_owner_name', 'ownership_type', 'date_business_start',
       'dba_name', 'naics_sector', 'naics_code', 'naics_description',
       'address_no', 'address_pd', 'address_road', 'address_sfx',
       'address_no_fraction', 'address_city', 'address_state', 'address_zip',
       'address_suite', 'address_pmb_box', 'address_po_box', 'bid',
       'council_district', 'lat', 'lng'],
      dtype='object')

Index(['LoanNumber', 'DateApproved', 'SBAOfficeCode', 'ProcessingMethod',
       'BorrowerName', 'BorrowerAddress', 'BorrowerCity', 'BorrowerState',
       'BorrowerZip', 'LoanStatusDate', 'LoanStatus', 'Term',
       'SBAGuarantyPercentage', 'InitialApprovalAmount',
       'CurrentApprovalAmount', 'UndisbursedAmount', 'FranchiseName',
       'ServicingLenderLocationID', 'ServicingLenderName',
       'ServicingLenderAddress', 'ServicingLenderCity', 'ServicingLenderState',
       'ServicingLenderZip', 'RuralUrbanIndicator', 'HubzoneIndicator',
       'LMIIndicator', 'BusinessAgeDescription', 'ProjectCity',
       'ProjectCountyName', 'ProjectState', 'ProjectZip', 'CD', 'JobsReported',
       'NAICSCode', 'Race', 'Ethnicity', 'UTILITIES_PROCEED',
       'PAYROLL_PROCEED', 'MORTGAGE_INTEREST_PROCEED', 'RENT_PROCEED',
       'REFINANCE_EIDL_PROCEED', 'HEALTH_CARE_PROCEED',
       'DEBT_INTEREST_PROCEED', 'BusinessType', 'OriginatingLenderLocationID',
       'OriginatingLender', 'Originatin

In [276]:
## defining helpful fields w/in each df for adjudicating matches
ppp_helpfulfields = ["BorrowerName", "BorrowerAddress", "BorrowerCity", 
                    "BorrowerZip", "FranchiseName", "NAICSCode", "ProjectZip"]
sd_helpfulfields = ["dba_name", "naics_code", "naics_sector",
                    "address_no", "address_pd", "address_road",
                    "address_sfx", "address_city", "address_zip", "date_cert_effective",
                   "date_cert_expiration", "business_owner_name"]

In [277]:
## the focal PPP borrower, restricted to the helpful fields
is_klein_loan = ppp_keep.BorrowerName == "THE KLEINFELDER GROUP, INC."
print(ppp_keep.loc[is_klein_loan, ppp_helpfulfields])


## the SD businesses picked up by the regex search, same treatment
is_klein_candidate = sd_active_biz.dba_name.isin(klein_possible)
print(sd_active_biz.loc[is_klein_candidate, sd_helpfulfields])

## takeaway: the likely match is either kleinfelder construction services
## or kleinfelder inc (could match to both), with a possible slight
## preference for kleinfelder inc based on the owner name

                      BorrowerName    BorrowerAddress BorrowerCity  \
34514  THE KLEINFELDER GROUP, INC.  550 West C Street    SAN DIEGO   

      BorrowerZip FranchiseName  NAICSCode  ProjectZip  
34514       92101           NaN   541330.0  92101-0300  
                                      dba_name  naics_code  naics_sector  \
2947         KLEINFELDER CONSTRUCTION SERVICES       54161            54   
24557  CH2M HILL KLEINFELDER & A JOINT VENTURE       54199            54   
33360                          KLEINFELDER INC      541615            54   

      address_no address_pd address_road address_sfx address_city address_zip  \
2947         550          W            C          ST    SAN DIEGO  92101-3532   
24557        402          W     BROADWAY         NaN    SAN DIEGO  92101-8544   
33360        550          W            C          ST    SAN DIEGO  92101-3532   

       date_cert_effective date_cert_expiration  \
2947   2020-12-01 12:00:00  2021-11-30 12:00:00   
24557  2020-1

### Preview of activity step 1: clean addresses in each of the datasets

Previous example shows us address can help adjudicate b/t matches.

When we break into groups, you'll
    
- Create a new zip code col that's just the first 5 digits
- Paste together the address_no, address_pd, address_road, address_sfx fields in the SD active biz to create a field similar to BorrowerAddress in the PPP loan data (pay attention to capitalization; might be easier to capitalize in each)



# Constructing our own matching function

The package we'll review makes matching easier by putting a lot of the hard stuff under the hood

But it's good to know what's going on under that hood.

Here, using the example of THE KLEINFELDER GROUP, INC., we'll look within the tentative matches + a random other sample of the SD business data to construct match points

## Step 0: pool of sd businesses to look in

Normally we'd look in full set but this helps with runtime

In [285]:
## pool to search in = (1) the Kleinfelder candidates, (2) 15 other businesses
## drawn at random (seeded so the draw is reproducible), (3) Duran Freight
is_klein_candidate = sd_active_biz.dba_name.isin(klein_possible)
klein_rows = sd_active_biz[is_klein_candidate].copy()
random_rows = sd_active_biz[~is_klein_candidate].sample(n = 15, random_state = 922).copy()
duran_rows = sd_active_biz[sd_active_biz.dba_name == "DURAN FREIGHT CORPORATION"]
sd_biz_lookin = pd.concat([klein_rows, random_rows, duran_rows])

sd_biz_lookin.head()

Unnamed: 0,account_key,account_status,account_status_code,date_account_creation,date_cert_expiration,date_cert_effective,business_owner_name,ownership_type,date_business_start,dba_name,...,address_city,address_state,address_zip,address_suite,address_pmb_box,address_po_box,bid,council_district,lat,lng
2947,1986015690,Active,A,1986-12-18 12:00:00,2021-11-30 12:00:00,2020-12-01 12:00:00,KLEINFELDER CONSTRUCTION SERVICES INC,CORP,1986-12-18 12:00:00,KLEINFELDER CONSTRUCTION SERVICES,...,SAN DIEGO,CA,92101-3532,1200,,,,3.0,32.716819,-117.15971
24557,2008027219,Active,A,2008-08-25 14:22:29,2021-09-30 23:59:59,2020-10-01 23:59:59,CH2M HILL INC & KLEINFELDER WEST INC,PARTNR,2008-09-30 00:00:00,CH2M HILL KLEINFELDER & A JOINT VENTURE,...,SAN DIEGO,CA,92101-8544,1450,,,,3.0,32.715751,-117.161036
33360,2014016589,Active,A,2014-06-05 16:44:28,2021-06-30 23:59:59,2020-07-01 23:59:59,KLEINFELDER INC,CORP,2014-06-09 00:00:00,KLEINFELDER INC,...,SAN DIEGO,CA,92101-3532,1200,,,,3.0,32.716819,-117.15971
21220,2007002502,Active,A,2007-03-05 14:47:46,2022-03-31 23:59:59,2021-04-01 23:59:59,SHAUN YUDELSON,SOLE,2007-03-06 00:00:00,SHAUN YUDELSON,...,SAN DIEGO,CA,92103-5212,,,,,3.0,32.741229,-117.14681
23703,2008004082,Active,A,2008-02-15 11:43:24,2022-02-28 23:59:59,2021-03-01 23:59:59,CRAIG FRAHM,SOLE,2008-02-08 00:00:00,CRAIG FRAHM,...,SAN DIEGO,CA,92130-7620,H304,,,,1.0,32.918741,-117.229652


## Step 1: find string similarity between (1) our focal PPP business (Kleinfelder) and (2) the businesses in the SD pool

Here, we're using edit distance and Jaccard distance; another common measure, which we compute right after, is Jaro-Winkler string similarity

Some options here: https://python.gotrained.com/nltk-edit-distance-jaccard-distance/

Can also use fuzzywuzzy installed on jhub- discussion here: https://towardsdatascience.com/fuzzy-string-matching-in-python-68f240d910fe

In [297]:
## first, let's process the biz name in two passes:
## (1) remove everything that's not (^) a word character or whitespace
##     (i.e., strip punctuation such as "," and ".")
## (2) remove a leading "THE "
focal_ppp_raw = "THE KLEINFELDER GROUP, INC."
focal_ppp_nopunct = re.sub(r"[^\w\s]", "", focal_ppp_raw)

## raw string so \s is a regex escape rather than an (invalid) string escape;
## anchored with ^ so only the leading article is dropped and a name that
## merely contains "THE " (e.g. "BATHE SHOP") is left intact
focal_ppp_cleaner = re.sub(r"^THE\s+", "", focal_ppp_nopunct)
focal_ppp_cleaner

'KLEINFELDER GROUP INC'

In [298]:
### look at a few different distance metrics

## edit distance between the cleaned focal name and each SD business name
## (a count of edits, so not on a 0-1 scale)
edit_dists = []
for other_name in sd_biz_lookin.dba_name:
    edit_dists.append(nltk.edit_distance(focal_ppp_cleaner, other_name))
sd_biz_lookin['dist_focal_edit'] = edit_dists

sd_biz_lookin[['dba_name', 'dist_focal_edit']].sort_values(by = 'dist_focal_edit')

## jaccard distance computed on the *sets of characters* in each name
focal_chars = set(focal_ppp_cleaner)
jacc_dists = []
for other_name in sd_biz_lookin.dba_name:
    jacc_dists.append(nltk.jaccard_distance(focal_chars, set(other_name)))
sd_biz_lookin['dist_focal_jacc'] = jacc_dists

sd_biz_lookin[['dba_name', 'dist_focal_jacc']].sort_values(by = 'dist_focal_jacc')


Unnamed: 0,dba_name,dist_focal_edit
33360,KLEINFELDER INC,6
40248,LAVERTY DESIGNS INC,14
18184,BRIAN CROWER INC,14
21220,SHAUN YUDELSON,16
15654,XKANDAL KREATIONZ,16
2947,KLEINFELDER CONSTRUCTION SERVICES,17
29947,JIPSON LLC,17
55857,EDUCATION-2U,18
60493,NORTH PARK SUSHI,18
53486,SOCAL CONNECTOR CONSULTING,18


Unnamed: 0,dba_name,dist_focal_jacc
33360,KLEINFELDER INC,0.285714
2947,KLEINFELDER CONSTRUCTION SERVICES,0.294118
31425,DURAN FREIGHT CORPORATION,0.294118
53486,SOCAL CONNECTOR CONSULTING,0.411765
61280,NICHOLAS LUNG SPORTS TRAINING,0.444444
24557,CH2M HILL KLEINFELDER & A JOINT VENTURE,0.454545
15654,XKANDAL KREATIONZ,0.5
40248,LAVERTY DESIGNS INC,0.526316
60493,NORTH PARK SUSHI,0.555556
29947,JIPSON LLC,0.5625


In [299]:
## the jaro-winkler function returns a similarity score, so we convert
## it to a distance with 1 - similarity
jaro_sims = [distance.get_jaro_distance(focal_ppp_cleaner, other_name,
                                        winkler = True, scaling = 0.1)
             for other_name in sd_biz_lookin.dba_name]
sd_biz_lookin['dist_focal_jaro'] = [1 - sim for sim in jaro_sims]

sd_biz_lookin[['dba_name', 'dist_focal_jaro']].sort_values(by = 'dist_focal_jaro')

Unnamed: 0,dba_name,dist_focal_jaro
33360,KLEINFELDER INC,0.06
2947,KLEINFELDER CONSTRUCTION SERVICES,0.14
40248,LAVERTY DESIGNS INC,0.31
18184,BRIAN CROWER INC,0.35
31425,DURAN FREIGHT CORPORATION,0.36
61280,NICHOLAS LUNG SPORTS TRAINING,0.4
15654,XKANDAL KREATIONZ,0.41
24557,CH2M HILL KLEINFELDER & A JOINT VENTURE,0.43
60493,NORTH PARK SUSHI,0.46
21220,SHAUN YUDELSON,0.48


### Step 2-- rule out potential matches with different zip codes

"Blocking" on the 5-digit zip code, i.e., requiring an exact match on it

In [300]:
## first, we reduce the SD zip codes (ZIP+4 such as 92101-3532) to the 5-digit
## zip by dropping the hyphen and everything after it, since our focal ppp biz
## has a 5-digit zip code
## (the column is still called zip_6dig because later cells refer to that name)
## raw string: "\-" in a normal string literal is an invalid escape sequence
sd_biz_lookin['zip_6dig'] = sd_biz_lookin.address_zip.str.replace(r"-.*", "", regex = True)

## get the zip- using iloc since we just want it as a string
## rather than series
focal_ppp_zip = ppp_keep.BorrowerZip[ppp_keep.BorrowerName == "THE KLEINFELDER GROUP, INC."].iloc[0]
focal_ppp_zip



'92101'

In [301]:
## flag (True/False) the SD businesses whose cleaned zip equals the focal biz's zip
same_zip_as_focal = sd_biz_lookin.zip_6dig.eq(focal_ppp_zip)
sd_biz_lookin['is_match_zip'] = same_zip_as_focal.to_numpy()

## show only the businesses that survive the zip requirement
sd_biz_lookin.loc[sd_biz_lookin.is_match_zip, sd_helpfulfields]

Unnamed: 0,dba_name,naics_code,naics_sector,address_no,address_pd,address_road,address_sfx,address_city,address_zip,date_cert_effective,date_cert_expiration,business_owner_name
2947,KLEINFELDER CONSTRUCTION SERVICES,54161,54,550,W,C,ST,SAN DIEGO,92101-3532,2020-12-01 12:00:00,2021-11-30 12:00:00,KLEINFELDER CONSTRUCTION SERVICES INC
24557,CH2M HILL KLEINFELDER & A JOINT VENTURE,54199,54,402,W,BROADWAY,,SAN DIEGO,92101-8544,2020-10-01 23:59:59,2021-09-30 23:59:59,CH2M HILL INC & KLEINFELDER WEST INC
33360,KLEINFELDER INC,541615,54,550,W,C,ST,SAN DIEGO,92101-3532,2020-07-01 23:59:59,2021-06-30 23:59:59,KLEINFELDER INC
40248,LAVERTY DESIGNS INC,54141,54,301,W,G,ST,SAN DIEGO,92101-6095,2020-07-01 23:59:59,2021-06-30 23:59:59,LAVERTY DESIGNS INC
17151,THE SALVATION ARMY ARC,81356,81,1335,,BROADWAY,,SAN DIEGO,92101-5708,2020-06-01 00:00:00,2021-05-31 00:00:00,THE SALVATION ARMY ARC


### Step 3: construct some match score

Record linkage methods have different ways for aggregating across fields

Here, we're going with a simple one of:

- Need to match the zip code of the focal Kleinfelder group directly
- Within those, find the average of the jarowinkler and jaccard string distance measures (we're excluding edit distance from that avg since on diff scale)

Whichever has the lowest average of two we consider the best match

In [303]:
## every dist_* column except the edit distance (which is on a different scale)
string_dist_fields = [col for col in sd_biz_lookin.columns
                      if "dist_" in col
                      if "edit" not in col]
string_dist_fields

## row-wise average of the selected distances
mean_distances = sd_biz_lookin[string_dist_fields].mean(axis = "columns")

mean_distances.iloc[0:5]

sd_biz_lookin['mean_string_dist'] = mean_distances

## among the zip matches, rank from most to least similar name
zip_matches = sd_biz_lookin.loc[sd_biz_lookin.is_match_zip,
                                sd_helpfulfields + ['mean_string_dist']]
zip_matches.sort_values(by = "mean_string_dist")

## would go with kleinfelder inc and maybe also
## the construction services

['dist_focal_jacc', 'dist_focal_jaro']

2947     0.217059
24557    0.442273
33360    0.172857
21220    0.545556
23703    0.588529
dtype: float64

Unnamed: 0,dba_name,naics_code,naics_sector,address_no,address_pd,address_road,address_sfx,address_city,address_zip,date_cert_effective,date_cert_expiration,business_owner_name,mean_string_dist
33360,KLEINFELDER INC,541615,54,550,W,C,ST,SAN DIEGO,92101-3532,2020-07-01 23:59:59,2021-06-30 23:59:59,KLEINFELDER INC,0.172857
2947,KLEINFELDER CONSTRUCTION SERVICES,54161,54,550,W,C,ST,SAN DIEGO,92101-3532,2020-12-01 12:00:00,2021-11-30 12:00:00,KLEINFELDER CONSTRUCTION SERVICES INC,0.217059
40248,LAVERTY DESIGNS INC,54141,54,301,W,G,ST,SAN DIEGO,92101-6095,2020-07-01 23:59:59,2021-06-30 23:59:59,LAVERTY DESIGNS INC,0.418158
24557,CH2M HILL KLEINFELDER & A JOINT VENTURE,54199,54,402,W,BROADWAY,,SAN DIEGO,92101-8544,2020-10-01 23:59:59,2021-09-30 23:59:59,CH2M HILL INC & KLEINFELDER WEST INC,0.442273
17151,THE SALVATION ARMY ARC,81356,81,1335,,BROADWAY,,SAN DIEGO,92101-5708,2020-06-01 00:00:00,2021-05-31 00:00:00,THE SALVATION ARMY ARC,0.554524


# That was a lot of steps. How can we use a package to automate a bit?

Google "fuzzy matching" or "probabilistic record linkage" packages in python

Here, we'll focus on 

- recordlinkage. Documentation: https://recordlinkage.readthedocs.io/en/latest/notebooks/link_two_dataframes.html



## Step 1. Define dataframes to match

Here, we'll use two dataframes:

- The `sd_biz_lookin` dataframe we've been working with 
- A ppp dataframe with (1) our focal business, (2) a small random sample of others, (3) the biz we know has an exact match

In [304]:
## df 2: (1) the focal business, (2) 10 other loans drawn at random (seeded),
## (3) the business we know has an exact match
ppp_lookin = pd.concat([ppp_keep[ppp_keep.BorrowerName == focal_ppp_raw].copy(),
                       ppp_keep[ppp_keep.BorrowerName != focal_ppp_raw].sample(n = 10, random_state = 42),
                       ppp_keep[ppp_keep.BorrowerName == "DURAN FREIGHT CORPORATION"]])


## clean name similarly to how we did before: strip punctuation
## (note: unlike the single-name example, a leading "THE " is kept here)
ppp_lookin['bname_clean'] = [re.sub(r"[^\w\s]", "", one_n) for one_n in ppp_lookin.BorrowerName]
ppp_lookin[['BorrowerName', 'bname_clean']].head()

## clean zip so that it's the 5-digit zip: drop the hyphen and the +4 suffix
## raw string: "\-" in a normal string literal is an invalid escape sequence
ppp_lookin['zip_4match'] = ppp_lookin.BorrowerZip.astype(str).str.replace(r"-.*", "", regex = True)
ppp_lookin[['BorrowerZip', 'zip_4match']].head()

## in exercise, you'll clean address and naics codes 


Unnamed: 0,BorrowerName,bname_clean
34514,"THE KLEINFELDER GROUP, INC.",THE KLEINFELDER GROUP INC
109290,FOXFURY LLC,FOXFURY LLC
84291,PASTOR OF SAINT MARTIN OF TOURS CATHOLIC PARIS...,PASTOR OF SAINT MARTIN OF TOURS CATHOLIC PARIS...
91230,HAYWOODS UNLIMITED PROMOTIONS INC.,HAYWOODS UNLIMITED PROMOTIONS INC
100349,"BOARDWALK F&B, LLC",BOARDWALK FB LLC


Unnamed: 0,BorrowerZip,zip_4match
34514,92101,92101
109290,92056,92056
84291,91941,91941
91230,92128-3609,92128
100349,90046-2416,90046


## Step 2: for ease of use, standardize colnames for the fields we'll use

In this practice exercise, we'll use:

- Fuzzy match on business name
- Exact match on 5-digit zip code

We only need to standardize the name of the exact match field, but are here just standardizing all for ease of use

In [305]:
## rename the SD columns we match on; the result is saved under a new name so
## an earlier step can be changed without reloading/rebuilding sd_biz_lookin
newcols_sd = {
    'dba_name': 'bizname_4match',
    'zip_6dig': 'zip_4match',
}
sd_4match = sd_biz_lookin.rename(columns = newcols_sd)

## preview only the match columns
sd_4match.filter(like = "4match").head()


## same for ppp data (its zip column is already called zip_4match)
newcols_ppp = {
    'bname_clean': 'bizname_4match',
}
ppp_4match = ppp_lookin.rename(columns = newcols_ppp)

ppp_4match.filter(like = "4match").head()


Unnamed: 0,bizname_4match,zip_4match
2947,KLEINFELDER CONSTRUCTION SERVICES,92101
24557,CH2M HILL KLEINFELDER & A JOINT VENTURE,92101
33360,KLEINFELDER INC,92101
21220,SHAUN YUDELSON,92103
23703,CRAIG FRAHM,92130


Unnamed: 0,bizname_4match,zip_4match
34514,THE KLEINFELDER GROUP INC,92101
109290,FOXFURY LLC,92056
84291,PASTOR OF SAINT MARTIN OF TOURS CATHOLIC PARIS...,91941
91230,HAYWOODS UNLIMITED PROMOTIONS INC,92128
100349,BOARDWALK FB LLC,90046


## Step 3: initialize the match object and tell it if anything to "block on" or exact match

Here, we're blocking on zip

In [306]:
## initialize indexer: the object that will generate the candidate record pairs
my_recordmatcher = recordlinkage.Index()
print(type(my_recordmatcher))

## tell it what to block on (skip if not blocking on anything)
## the column name given here must be present in both dataframes
## passed to the indexer in the next step
my_recordmatcher.block("zip_4match")



<class 'recordlinkage.api.Index'>


<Index>

## Step 4: create candidate links based on that blocking variable

In [307]:
## then, feed the record matcher the two datasets (both must have the blocking
## variable); the candidate links it returns are the pairs of records that
## match exactly on that variable
candidate_links_zip = my_recordmatcher.index(sd_4match, ppp_4match)
candidate_links_zip

print(type(candidate_links_zip))

## see that it's a pandas MultiIndex of pairs: the first element of each pair is
## the index in the first df we feed it; the second is the index in the second df

## print example of links: one SD record and the two PPP records it is paired with
sd_match_cols = [col for col in sd_4match.columns if "4match" in col]
sd_4match.loc[sd_4match.index == 2947, sd_match_cols]

ppp_match_cols = [col for col in ppp_4match.columns if "4match" in col]
ppp_4match.loc[ppp_4match.index.isin([34514, 112928]), ppp_match_cols]

MultiIndex([( 2947,  34514),
            ( 2947, 112928),
            (24557,  34514),
            (24557, 112928),
            (33360,  34514),
            (33360, 112928),
            (40248,  34514),
            (40248, 112928),
            (17151,  34514),
            (17151, 112928),
            (29947,  80338),
            (29947,  88795),
            (31425,  80338),
            (31425,  88795)],
           )

<class 'pandas.core.indexes.multi.MultiIndex'>


Unnamed: 0,bizname_4match,zip_4match
2947,KLEINFELDER CONSTRUCTION SERVICES,92101


Unnamed: 0,bizname_4match,zip_4match
34514,THE KLEINFELDER GROUP INC,92101
112928,GLOBE HARU INC,92101


## Step 5- initialize Compare and define fuzzy fields and threshold for each

Note in documentation about diff string compare methods:
This class is used to compare string values. The implemented algorithms are: ‘jaro’,’jarowinkler’, ‘levenshtein’, ‘damerau_levenshtein’, ‘qgram’ or ‘cosine’. In case of agreement, the similarity is 1 and in case of complete disagreement it is 0. The Python Record Linkage Toolkit uses the jellyfish package for the Jaro, Jaro-Winkler, Levenshtein and Damerau- Levenshtein algorithms.

In [308]:
## initialize the object that holds the field-by-field comparison rules
compare = recordlinkage.Compare()

## fuzzy comparison of the business name in the two dataframes using jaro
## similarity; with a threshold set, the comparison output shown later is
## 0/1 rather than the raw similarity score
thres_bizname = 0.7
compare.string('bizname_4match', 'bizname_4match', method='jaro', threshold=thres_bizname)

print(type(compare))

<Compare>

<class 'recordlinkage.api.Compare'>


## Step 6- using the compare Class and the candidate links, compute comparisons

In [310]:
compare_vectors = compare.compute(candidate_links_zip, sd_4match, ppp_4match)
print(type(compare_vectors))

compare_vectors

## returns the result from comparing each candidate pair of records - e.g., the 2947
## example above (kleinfelder construction with naics 54161)
## has candidate pairs of (1) Kleinfelder group naics code 541330 (index 34514)
## and (2) globe haru naics code 722511 (index 112928); in the output displayed
## for this run both of those pairs score 0.0 on name, and the pair flagged as a
## name match (1.0) is kleinfelder inc (index 33360) with Kleinfelder group (index 34514)

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Unnamed: 1,0
2947,34514,0.0
2947,112928,0.0
24557,34514,0.0
24557,112928,0.0
33360,34514,1.0
33360,112928,0.0
40248,34514,0.0
40248,112928,0.0
17151,34514,0.0
17151,112928,0.0


## Step 7. decide what counts as a true match

Three general approaches:

- Threshold based: look at the raw scores and determine what scores are above a threshold
- Unsupervised: something that clusters the pairs into "likely match" or "likely not match" but where we're not feeding it "labels" corresponding to true matches
- Supervised: we have some gold-standard label dataset that has an indicator for whether records are true matches; we train a model on those true matches and generalize to new cases

See here for many classifiers: https://recordlinkage.readthedocs.io/en/latest/ref-classifiers.html

Here, we're using unsupervised and k-means clustering algorithm

Other option is an EM-based classifier initialized as follows, but not enough data here to fit:
ecm = recordlinkage.ECMClassifier()
  


In [311]:
## initialize classifier (unsupervised: no match labels are supplied)
kmeans = recordlinkage.KMeansClassifier()
## fit on the comparison vectors and predict in one step; the result displayed
## below is a MultiIndex holding the (sd index, ppp index) pairs classified as matches
kmeans_results = kmeans.fit_predict(compare_vectors)
print(type(kmeans_results))
kmeans_results


<class 'pandas.core.indexes.multi.MultiIndex'>


MultiIndex([(33360, 34514),
            (31425, 80338),
            (31425, 88795)],
           )

## Step 8- extract pairs using indices and summarize

In [312]:
## each predicted match is a pair: sd was our left hand side data, so its index
## comes first; ppp loans were our right hand side data, so their index comes second
indices_sd, indices_ppp = [], []
for sd_idx, ppp_idx in kmeans_results:
    indices_sd.append(sd_idx)
    indices_ppp.append(ppp_idx)

## create dataframe with one row per matched pair
df_matchpairs = pd.DataFrame({'sd_indices': indices_sd,
                              'ppp_indices': indices_ppp})

df_matchpairs

## add indices as col to orig data so they can serve as merge keys
sd_4match['index_4merge'] = sd_4match.index
ppp_4match['index_4merge'] = ppp_4match.index

## then, join matches

### first, attach the sd info to each pair
sd_cols_to_attach = sd_4match[['index_4merge', 'bizname_4match', 'zip_4match']]
df_matchpairs_wsd = df_matchpairs.merge(sd_cols_to_attach,
                                        how = "left",
                                        left_on = "sd_indices",
                                        right_on = "index_4merge")

df_matchpairs_wsd

## then, attach the ppp info; the suffixes distinguish the sd columns
## from the identically named ppp columns
ppp_cols_to_attach = ppp_4match[['index_4merge', 'bizname_4match', 'zip_4match']]
df_matchpairs_wboth = df_matchpairs_wsd.merge(ppp_cols_to_attach,
                                              how = "left",
                                              left_on = "ppp_indices",
                                              right_on = "index_4merge",
                                              suffixes = ["_sd_tax", "_ppp"])

df_matchpairs_wboth


Unnamed: 0,sd_indices,ppp_indices
0,33360,34514
1,31425,80338
2,31425,88795


Unnamed: 0,sd_indices,ppp_indices,index_4merge,bizname_4match,zip_4match
0,33360,34514,33360,KLEINFELDER INC,92101
1,31425,80338,31425,DURAN FREIGHT CORPORATION,92154
2,31425,88795,31425,DURAN FREIGHT CORPORATION,92154


Unnamed: 0,sd_indices,ppp_indices,index_4merge_sd_tax,bizname_4match_sd_tax,zip_4match_sd_tax,index_4merge_ppp,bizname_4match_ppp,zip_4match_ppp
0,33360,34514,33360,KLEINFELDER INC,92101,34514,THE KLEINFELDER GROUP INC,92101
1,31425,80338,31425,DURAN FREIGHT CORPORATION,92154,80338,DURAN FREIGHT CORPORATION,92154
2,31425,88795,31425,DURAN FREIGHT CORPORATION,92154,88795,DURAN FREIGHT CORPORATION,92154


# Activity

- First, get the hang of the above by changing around the string distance threshold and seeing if you get more or fewer matches
- Then, clean the address fields in the respective data sets and add that as an additional string field to the compare step. Does that change matches? Why or why not?
- Finally, go back to the original data and take a random sample of ~200 rows of the SD tax certificates data and the PPP loans data. Put the above steps into a function that takes each dataframe as an argument, an argument for what variable to block on, and an argument for a string field to treat fuzzily and a threshold field. Play around with matches using different variables to block on between zip code, city name, 2-digit naics sector, etc. As a warning, you may not get any matches with a random sample so may want to construct a targeted sample using a few ones you know have matches
- **Challenge exercise**: make the function general enough so that it can take in multiple string arguments to potentially fuzzy match on 

