# Imports 

In [1]:
import pandas as pd
import re 
import numpy as np
import os

import recordlinkage

## repeated printouts: with ast_node_interactivity set to "all", a cell
## displays the value of every expression statement it contains, not only
## the last one (so e.g. two bare `.columns` lines both print)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Run code to load data

Reminder: if you moved this activity notebook, you'll need to update the paths below, since they are relative to the folder structure of our class repo. If you get lost, run `os.getcwd()` to see which directory the notebook is running from.

In [2]:
# Paths are relative to the notebook's working directory
sd_path = "../../public_data/sd_forfuzzy.csv"
ppp_path = "../../public_data/ppploans_forfuzzy.csv"

# Read the San Diego file and the PPP loans file into data frames
sd = pd.read_csv(sd_path)
ppp = pd.read_csv(ppp_path)

# Show the column names of each frame
sd.columns
ppp.columns

Index(['dba_name', 'business_owner_name', 'naics_code', 'address_no',
       'address_pd', 'address_road', 'address_sfx', 'address_city',
       'address_zip', 'zip_6dig'],
      dtype='object')

Index(['BorrowerName', 'BorrowerAddress', 'BorrowerCity', 'BorrowerZip',
       'FranchiseName', 'NAICSCode', 'BorrowerZip_6dig', 'Race', 'Ethnicity'],
      dtype='object')

# Activity

1. Clean the address fields in the respective datasets (e.g., replace NaN in the San Diego address fields with "" and then paste the fields together)


In [3]:
# your code to clean address in san diego data

## step 1: replace NA with ""
### get address cols (any column whose name contains "address")
sd_address = [col for col in sd.columns if "address" in col]

### replace NA for those with "" so that string concatenation below
### does not turn the whole merged address into NaN
sd[sd_address] = sd[sd_address].fillna("")


def _squeeze_spaces(text):
    """Collapse whitespace runs in a string Series to one space and trim the ends.

    Joining components with " " leaves doubled, leading, or trailing
    spaces wherever a component is "" (which step 1 produces for every
    missing value). Those stray spaces would count as mismatched
    characters when the merged address is later fuzzy-compared with
    another address string.
    """
    return text.str.replace(r"\s+", " ", regex=True).str.strip()


## step 2: paste together
### more manual way
sd['address_merged_manual'] = _squeeze_spaces(
    sd.address_no + " " + sd.address_pd + " " + sd.address_road + " " + sd.address_sfx)

### more automatic way- using row-wise join
### to separate by a space
sd['address_merged'] = _squeeze_spaces(
    sd[['address_no', 'address_pd', 'address_road',
        'address_sfx']].agg(' '.join, axis=1))

### the two approaches must agree; fail loudly instead of silently
### printing nothing when they differ
assert sd.address_merged_manual.equals(sd.address_merged), \
    "manual and row-wise address merges disagree"
print("same result")

same result


2. Clean the naics code fields in the datasets to extract the first two digits


In [4]:
# your code to shorten naics code to 2 digits
## .astype(str) turns a missing code into the text "nan", so slicing alone
## would give such rows the sector "na" -- and an exact-match block on
## naics_2 would then pair up missing-code rows from both datasets.
## Masking with .notna() keeps those rows missing; valid codes are unchanged.
## NOTE(review): recordlinkage's blocking is expected to skip rows whose
## blocking key is missing -- confirm against the installed version.
sd['naics_2'] = sd.naics_code.astype(str).str[0:2].where(sd.naics_code.notna())
ppp['naics_2'] = ppp.NAICSCode.astype(str).str[0:2].where(ppp.NAICSCode.notna())


3. Write code to fuzzy match using different variables to block on between zip code, city name, 2-digit naics sector, etc.; you can either write this code step by step as in the codeexample notebook or consolidate into a function

In [5]:
## application 1: blocking on zip code and naics 2-dig

### the blocking call names each key once, so the 6-dig zip column
### needs the same name in both df: give the PPP column the name
### already used in sd
ppp = ppp.rename(columns={'BorrowerZip_6dig': 'zip_6dig'})


In [6]:
################ Step 1: blocking (exact matches) ###########
### build an indexer that blocks on the 2-digit naics code and the
### 6-digit zip, then compute the candidate links between sd and ppp
indexer = recordlinkage.Index()
indexer.block(['naics_2', 'zip_6dig'])
candidate_links = indexer.index(sd, ppp)

################ Step 2: fuzzy comparison ###########
### set up a compare object with two jaro-winkler string comparisons
### (business name and street address), both given the same threshold
thres = 0.7
c = recordlinkage.Compare()
c.string('dba_name', 'BorrowerName',
         method='jarowinkler', threshold=thres)
c.string('address_merged', 'BorrowerAddress',
         method='jarowinkler', threshold=thres)

################ Step 3: combine exact and fuzzy ###########
### compute the comparisons over the candidate links and let the ECM
### classifier pick the predicted matches
compare_naicszipnameadd = c.compute(candidate_links, sd, ppp)
ecm = recordlinkage.ECMClassifier()
ecm_pairs = list(ecm.fit_predict(compare_naicszipnameadd))
predicted_matches_ecm = pd.DataFrame(ecm_pairs,
                                     columns=['sd_index', 'ppp_index'])
predicted_matches_ecm.head() # gives us indices in each data

################ Step 4: convert indices to usable data ###########
### expose each frame's index as a column so it can serve as a merge key
sd['sd_index'] = sd.index
ppp['ppp_index'] = ppp.index

### columns to carry over from each source frame
sd_cols = ['dba_name', 'sd_index', 'address_merged', "naics_2"]
ppp_cols = ['BorrowerName', 'ppp_index', 'BorrowerAddress', 'naics_2']

### inner joins keep only the predicted matches; the suffixes tell the
### two naics_2 columns apart after the second merge
m1_addsd = predicted_matches_ecm.merge(sd[sd_cols],
                                       on="sd_index",
                                       how="inner")
m2_addp = m1_addsd.merge(ppp[ppp_cols],
                         on="ppp_index",
                         how="inner",
                         suffixes=["_SDtaxdata", "_PPPloandata"])

### side-by-side view of the paired names, addresses and naics codes
m2_addp[['BorrowerName', 'dba_name',
         "BorrowerAddress", 'address_merged',
         'naics_2_SDtaxdata',
         'naics_2_PPPloandata']] # see some true and some false pos

<Index>

<Compare>

<Compare>

Unnamed: 0,sd_index,ppp_index
0,170,2934
1,10,2652
2,114,2583
3,222,3707
4,95,4620


Unnamed: 0,BorrowerName,dba_name,BorrowerAddress,address_merged,naics_2_SDtaxdata,naics_2_PPPloandata
0,"THE LAW OFFICES OF RONSON J. SHAMOUN, APC",LAW OFFICES OF THOMAS T LORD,303 A ST,401 W A ST,54,54
1,IDSOLUTIONS INC,IDS GROUP INC,2763 CAMINO DEL RIO SOUTH,4445 CAMINO DEL RIO SOUTH,54,54
2,BRIERTON JONES &#X26; JONES LLP,BRIERTON JONES & JONES LLP,1550 HOTEL CIRCLE N STE 300,1550 HOTEL CIR,54,54
3,INNO TECH MANUFACTURING INC.,INNO TECH MANUFACTURING INC,10109 CARROLL CANYON RD,10109 CARROLL CANYON RD,33,33
4,"BENEFIT PRO INSURANCE SERVICES, INC.",FIRST COMMAND FINANCIAL SERVICES INC,2655 CAMINO DEL RIO,2650 CAMINO DEL RIO NORTH,52,52
5,HERITAGE TAX AND INSURANCE SERVICES INC,FIRST COMMAND FINANCIAL SERVICES INC,2650 CAMINO N 350,2650 CAMINO DEL RIO NORTH,52,52
6,SAN DIEGO GASTROENTEROLOGY MEDICAL ASSOC,BALBOA NEPHROLOGY MEDICAL GROUP INC,4060 4TH AVE STE 240,4060 04TH AVE,62,62
7,BEACH PARTNERS LLC,BEACH BUMZZ LLC,718 VENTURA PLACE,978 GARNET AVE,72,72
8,"BKM OFFICEWORKS, LLC",BKM OFFICEWORKS,4780 EASTGATE MALL #100,4780 EASTGATE MALL,42,42
9,SAN DIEGO SPORTS MEDICINE AND FAMILY HEALTH CE...,SAN DIEGO SPORTS MEDICINE & FAMILY HEALTH CNTR,6699 ALVARADO RD Ste 2100,6699 ALVARADO RD,62,62
