# Imports 

In [1]:
import pandas as pd
import re 
import numpy as np
import datetime
from datetime import datetime

## a couple recordlinkage packages
import recordlinkage

## repeated printouts
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
## modules still being installed on jhub
## nltk for string distance
import nltk

## jarowinkler
from pyjarowinkler import distance

# Load raw data 

In [3]:
## general link: https://data.sandiego.gov/datasets/business-listings/

## active tax certificates
sd = pd.read_csv("https://seshat.datasd.org/ttcs/sd_businesses_active_datasd.csv")



CA     127150
TX      74279
NY      71696
FL      58508
IL      40116
PA      37675
OH      31980
NJ      31620
MI      29242
MA      26863
GA      25170
WA      23197
VA      22797
NC      22292
CO      19386
MN      19336
MD      18562
WI      17119
IN      16323
MO      16003
TN      15548
AZ      15400
LA      13375
OR      13195
CT      12613
AL      10672
SC      10496
OK       9833
UT       9102
KY       9001
IA       8144
NV       8039
KS       8036
AR       5747
NE       5695
MS       5347
NH       4982
HI       4894
NM       4392
ID       4324
DC       4241
ME       4129
RI       3720
WV       3285
ND       3162
MT       3093
DE       2921
PR       2817
SD       2647
AK       2599
VT       2234
WY       2114
GU        425
VI        272
MP         80
AS         18
NaN        14
Name: BorrowerState, dtype: int64

(17196, 51)

In [4]:
## PPP loans subsetted to CA and sd zip
ppp = pd.read_csv("https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/6b62a44b-69ec-436a-9b95-0ea550475543/download/public_150k_plus.csv")
ppp.BorrowerState.value_counts(dropna = False)
ppp = ppp[(ppp.BorrowerState == "CA") &
                  (ppp.BorrowerZip.isin(sd.address_zip))].copy()
ppp.shape

CA     127150
TX      74279
NY      71696
FL      58508
IL      40116
PA      37675
OH      31980
NJ      31620
MI      29242
MA      26863
GA      25170
WA      23197
VA      22797
NC      22292
CO      19386
MN      19336
MD      18562
WI      17119
IN      16323
MO      16003
TN      15548
AZ      15400
LA      13375
OR      13195
CT      12613
AL      10672
SC      10496
OK       9833
UT       9102
KY       9001
IA       8144
NV       8039
KS       8036
AR       5747
NE       5695
MS       5347
NH       4982
HI       4894
NM       4392
ID       4324
DC       4241
ME       4129
RI       3720
WV       3285
ND       3162
MT       3093
DE       2921
PR       2817
SD       2647
AK       2599
VT       2234
WY       2114
GU        425
VI        272
MP         80
AS         18
NaN        14
Name: BorrowerState, dtype: int64

(17196, 51)

# Activity

- Clean the address fields in the respective data sets 
- Clean the naics code fields in the data to extract the first two digits
- Take a random sample of ~200 rows of each of the datasets (sd and ppp). Write code to fuzzy match using different variables to block on between zip code, city name, 2-digit naics sector, etc. As a warning, you may not get any matches with a random sample so may want to construct a targeted sample using a few ones you know have matches
- If you haven't already, put the steps in the recordlinkage process into a function
- **Challenge exercise**: make the function general enough so that it can take in multiple string arguments to potentially fuzzy match on



## Step 1 and 2- preprocess join fields

### Cleaning ppp address cols

In [38]:
## view random sample of ppp rows
ppp_addcols = ["BorrowerAddress", "BorrowerCity", "BorrowerState", "BorrowerZip"]
ppp[ppp_addcols].sample(n = 10, random_state = 666)

## see that (1) 5-digit versus 9 dig zip code
## and (2) variable capitalization

## first, strip zip code to be first 5 dig
ppp['BorrowerZip'] = ppp.BorrowerZip.str.replace("\-.*", "", regex = True)

## first, convert those cols to upper
## using apply to do less manually
ppp[ppp_addcols] = ppp[ppp_addcols].apply(lambda x: x.astype(str).str.upper())
ppp[ppp_addcols].sample(n = 10, random_state = 666)


Unnamed: 0,BorrowerAddress,BorrowerCity,BorrowerState,BorrowerZip
126794,1758 JUNCTION AVENUE,SAN JOSE,CA,95112
66559,1551 E. 4TH ST.,ONTARIO,CA,91764
55792,2075 SAN JOAQUIN HILLS RD,NEWPORT BEACH,CA,92660
128905,1741 EASTLAKE PKWY STE 102 #224,CHULA VISTA,CA,91915
55888,1761 REYNOLDS AVE.,IRVINE,CA,92614
72564,5 MASON,IRVINE,CA,92618
90507,11107 ROSELLE ST STE 211,SAN DIEGO,CA,92121
47338,5151 SHOREHAM PL STE 260,SAN DIEGO,CA,92122
107918,15546 PASEO JENGHIZ,SAN DIEGO,CA,92129
125776,1340 E VALLEY PKWY,ESCONDIDO,CA,92027


Unnamed: 0,BorrowerAddress,BorrowerCity,BorrowerState,BorrowerZip
126794,1758 JUNCTION AVENUE,SAN JOSE,CA,95112
66559,1551 E. 4TH ST.,ONTARIO,CA,91764
55792,2075 SAN JOAQUIN HILLS RD,NEWPORT BEACH,CA,92660
128905,1741 EASTLAKE PKWY STE 102 #224,CHULA VISTA,CA,91915
55888,1761 REYNOLDS AVE.,IRVINE,CA,92614
72564,5 MASON,IRVINE,CA,92618
90507,11107 ROSELLE ST STE 211,SAN DIEGO,CA,92121
47338,5151 SHOREHAM PL STE 260,SAN DIEGO,CA,92122
107918,15546 PASEO JENGHIZ,SAN DIEGO,CA,92129
125776,1340 E VALLEY PKWY,ESCONDIDO,CA,92027


### Cleaning sd address cols

In [47]:
sd_address = ['address_no', 'address_pd', 'address_road', 'address_sfx', 'address_city', 
             'address_state', 'address_zip']

sd[sd_address].head()

## truncate zip
sd['BorrowerZip'] = sd.address_zip.str.replace("\-.*", "", regex = True)

## replace NA for those with ""
sd[sd_address] = sd[sd_address].fillna("")

## do more manual truncation since no, pd, road, and sfx are
## separated by whitespace; others are comma
sd['address_merged'] = sd[['address_no', 'address_pd', 'address_road', 
                               'address_sfx']].agg(' '.join, axis=1) 

sd[sd_address + ['address_merged']].head()
                            
## leaving city, state, and zip separate

Unnamed: 0,address_no,address_pd,address_road,address_sfx,address_city,address_state,address_zip
0,9655,,GRANITE RIDGE,DR,SAN DIEGO,CA,92123
1,1168,,FESLER,ST,EL CAJON,CA,92020
2,8666,,COMMERCE,AVE,SAN DIEGO,CA,92121
3,10101,,OLD GROVE,RD,SAN DIEGO,CA,92131
4,1,,VISION,WAY,BLOOMFIELD,CT,6002


Unnamed: 0,address_no,address_pd,address_road,address_sfx,address_city,address_state,address_zip,address_merged
0,9655,,GRANITE RIDGE,DR,SAN DIEGO,CA,92123,9655 GRANITE RIDGE DR
1,1168,,FESLER,ST,EL CAJON,CA,92020,1168 FESLER ST
2,8666,,COMMERCE,AVE,SAN DIEGO,CA,92121,8666 COMMERCE AVE
3,10101,,OLD GROVE,RD,SAN DIEGO,CA,92131,10101 OLD GROVE RD
4,1,,VISION,WAY,BLOOMFIELD,CT,6002,1 VISION WAY


### Create 2-dig naics codes

In [45]:
## already exists for sd
sd[[col for col in sd.columns if "naics" in col]]
print('NAICS 2-dig in SD are:-----------------')
sd.naics_sector.value_counts(dropna = False)

## for ppp- truncate to first 2dig
ppp['naics_sector'] = ppp.NAICSCode.astype(str).str[0:2]
print('NAICS 2-dig in PPP are:-----------------')
ppp.naics_sector.value_counts(dropna = False)

Unnamed: 0,naics_sector,naics_code,naics_description
0,44,442,FURNITURE & HOME FURNISHINGS STORES
1,23,23511,"PLUMBING, HEATING & AC CONTRACTOR"
2,42,42199,OTHER MISCELLANEOUS DURABLE GOODS WHSLE
3,42,42199,OTHER MISCELLANEOUS DURABLE GOODS WHSLE
4,45,4539,OTHER MISCELLANEOUS STORE RETAILERS
...,...,...,...
64755,54,541615,CONSULTING SERVICES
64756,54,54193,TRANSLATION & INTERPRETATION SERVICES
64757,56,56199,ALL OTHER SUPPORT SERVICES
64758,54,54199,"ALL OTH PROF, SCIENTIFIC & TECHNICAL SERVICES"


NAICS 2-dig in SD are:-----------------


54    13386
81     9616
56     5038
45     4961
62     4887
23     4839
44     3690
72     3621
53     3194
42     2051
48     1956
61     1795
71     1475
52     1219
33      846
51      802
32      325
49      315
31      311
55      235
11      117
22       68
92       11
21        2
Name: naics_sector, dtype: int64

NAICS 2-dig in PPP are:-----------------


23    2687
54    2636
72    1999
62    1319
33    1298
42    1005
56     962
81     800
44     590
53     458
45     411
51     411
48     387
32     382
na     362
52     333
61     266
99     209
31     208
71     203
11      97
49      43
22      42
92      40
55      29
21      19
Name: naics_sector, dtype: int64

## Clean business name by converting to upper and removing some puncutation

Here, i'm stripping out the , and . but keeping & since it may be relevant

In [56]:
print("Before cleaning names:-----------------")
sd.dba_name.sample(10, random_state = 4)
ppp.BorrowerName.sample(10, random_state = 4)

sd['clean_name'] = [re.sub(r"\,|\.", "", name) for name in sd.dba_name]
ppp['clean_name'] = [re.sub(r"\,|\.", "", name) for name in ppp.BorrowerName]

print("After cleaning names:-----------------")
sd.clean_name.sample(10, random_state = 4)
ppp.clean_name.sample(10, random_state = 4)

Before cleaning names:-----------------


25645                    WHITE DOG INTERIORS
48196                     CAMP RUN A MUTT SV
41898                ADVANTAGE AMBULANCE INC
29977            EXECUTIVE SEARCH RECRUITING
46286                         ALOHA PLUMBING
38092                      ROYAL MOTORSPORTS
3486     INTERNATIONAL PRIME INGREDIENTS INC
3263            ANNS ALTERATIONS & TAILORING
52205           APNEA & BREATHING CLINIC LLC
32260                     PAUL FRYE WEDDINGS
Name: dba_name, dtype: object

120501            ANDREWS BROTHERS CONCRETE INC
55645                           POOLMASTER INC.
142028            SAME DAY EXPRESS DELIVERY INC
82958          MINDFIRE INTERNET SOLUTIONS, INC
119802                         BEST TRAILER INC
77497               WORK COMP RESOLUTIONS, INC.
161336         DOUGLAS TECHNOLOGIES GROUP, INC.
136913           LAKSHMI HOSPITALITY GROUP, LLC
116350    URBAN FUTURES BOND ADMINISTRATION INC
120059         M &#X26; S BUILDING SUPPLY, INC.
Name: BorrowerName, dtype: object

After cleaning names:-----------------


25645                    WHITE DOG INTERIORS
48196                     CAMP RUN A MUTT SV
41898                ADVANTAGE AMBULANCE INC
29977            EXECUTIVE SEARCH RECRUITING
46286                         ALOHA PLUMBING
38092                      ROYAL MOTORSPORTS
3486     INTERNATIONAL PRIME INGREDIENTS INC
3263            ANNS ALTERATIONS & TAILORING
52205           APNEA & BREATHING CLINIC LLC
32260                     PAUL FRYE WEDDINGS
Name: clean_name, dtype: object

120501            ANDREWS BROTHERS CONCRETE INC
55645                            POOLMASTER INC
142028            SAME DAY EXPRESS DELIVERY INC
82958           MINDFIRE INTERNET SOLUTIONS INC
119802                         BEST TRAILER INC
77497                 WORK COMP RESOLUTIONS INC
161336           DOUGLAS TECHNOLOGIES GROUP INC
136913            LAKSHMI HOSPITALITY GROUP LLC
116350    URBAN FUTURES BOND ADMINISTRATION INC
120059           M &#X26; S BUILDING SUPPLY INC
Name: clean_name, dtype: object

## Step 2 - randomly sample rows and try fuzzy matching

In [91]:
## shift to larger sample out of SD since that's the pool
## we're searching for PPP in
ppp_samp = ppp.sample(n = 200, random_state = 566)
sd_samp = sd.sample(n = 20000, random_state = 566)

## here, i'm blocking on zip code and fuzzily matching on 
## business name and street name  

### step 1- init recordlinker
link_ppp_sd = recordlinkage.Index()

### step 2 - tell it what to block on 
link_ppp_sd.block("BorrowerZip")

### step 3- compute candidate links
candidate_links_zipcode = link_ppp_sd.index(sd_samp, ppp_samp)
candidate_links_zipcode

## step 3 under the hood - pull out example
## see that they have the same zip but unlikely to be matches
sd_samp.loc[sd_samp.index == 7201,
           ["clean_name", "BorrowerZip"]]
ppp_samp.loc[ppp_samp.index.isin([137429, 84332]),
           ["clean_name", "BorrowerZip"]]

### step 4- create compare class and add fuzzy strings
compare = recordlinkage.Compare()
compare.string('clean_name', 'clean_name', method='jarowinkler', threshold=0.7)
compare.string('address_merged', 'BorrowerAddress', method = "jarowinkler", threshold = 0.9)



<Index>

MultiIndex([( 3730,  72883),
            ( 3711,  72883),
            (25969,  72883),
            ( 8334,  72883),
            (17022,  72883),
            (29750,  72883),
            ( 2943,  72883),
            (26489,  72883),
            (58605,  72883),
            (22939,  72883),
            ...
            (20316,  42434),
            (16777,  55381),
            (16777, 126098),
            (16777,  37890),
            (16777, 108128),
            (60319,  55381),
            (60319, 126098),
            (60319,  37890),
            (60319, 108128),
            (15899,  45820)],
           length=39206)

Unnamed: 0,clean_name,BorrowerZip
7201,BRANDMAN UNIVERSITY,92618


Unnamed: 0,clean_name,BorrowerZip
137429,SOUTHERN FIREPLACE DISTRIBUTORS INC,92618
84332,ADVANCED INDUSTRIAL SOLUTIONS INC,92618


<Compare>

<Compare>

In [92]:
## step 5- feed the candidate links to the compare class and compute
compare_vectors = compare.compute(candidate_links_zipcode, sd_samp, ppp_samp)
compare_vectors

## step 5 under the hood - pull out example
## of biz with matching (non-zero) address col
## and name col
compare_vectors.columns = ["name", "address"]
compare_vectors[(compare_vectors.name == 1) &
               (compare_vectors.address == 1)]


Unnamed: 0,Unnamed: 1,0,1
3730,72883,0.0,0.0
3711,72883,0.0,0.0
25969,72883,0.0,0.0
8334,72883,0.0,0.0
17022,72883,0.0,0.0
...,...,...,...
60319,55381,0.0,0.0
60319,126098,0.0,1.0
60319,37890,0.0,0.0
60319,108128,0.0,0.0


Unnamed: 0,Unnamed: 1,name,address
14769,68600,1.0,1.0
54583,68600,1.0,1.0
3397,114033,1.0,1.0
18775,89263,1.0,1.0
6014,89263,1.0,1.0
43620,49624,1.0,1.0


In [93]:
### step 5 under the hood - example of similar-enough name
example_id = 14769
sd_samp.loc[sd_samp.index == example_id,
           ["clean_name", "BorrowerZip", "address_merged"]]
ppp_samp.loc[ppp_samp.index.isin([68600]),
           ["clean_name", "BorrowerZip", "BorrowerAddress"]]

## yay looks like a true match!

## step 6 -- algorithm to aggregate across fields
## example code has k-means
## here, we're using e-m alg
## and transforming it to a df
## sd_index is on left 
ecm = recordlinkage.ECMClassifier()
predicted_matches_ecm = pd.DataFrame(list(ecm.fit_predict(compare_vectors)),
                                     columns = ['sd_index', 'ppp_index'])
print("Matches returned are:------------")
predicted_matches_ecm


Unnamed: 0,clean_name,BorrowerZip,address_merged
14769,LATITUDE PHARMACEUTICALS INC,92131,9675 BUSINESSPARK AVE


Unnamed: 0,clean_name,BorrowerZip,BorrowerAddress
68600,LATITUDE PHARMACEUTICALS INC,92131,9675 BUSINESSPARK AVE


Matches returned are:------------


Unnamed: 0,sd_index,ppp_index
0,14769,68600
1,54583,68600
2,3397,114033
3,18775,89263
4,6014,89263
5,43620,49624


In [99]:
## step 7 - merge the two to compare
### merge to sd data
sd_samp['sd_index'] = sd_samp.index
m1_addsd = pd.merge(predicted_matches_ecm,
                   sd_samp[['clean_name', 'sd_index', 'address_merged', "naics_sector"]],
                   on = "sd_index",
                   how = "inner")
ppp_samp['ppp_index'] = ppp_samp.index
m2_addp = pd.merge(m1_addsd, 
                  ppp_samp[['clean_name', 'ppp_index', 'BorrowerAddress', 'naics_sector']],
                  on = "ppp_index",
                  how = "inner",
                  suffixes = ["_SDtaxdata", "_PPPloandata"])

## see some true matches and some false matches (e.g., adaptive launch solutions
## is a false match; habitat restoration sciences is a false match)
## would then want to adjust string threshold
## or possibly add naics code as field
m2_addp

Unnamed: 0,sd_index,ppp_index,clean_name_SDtaxdata,address_merged,naics_sector_SDtaxdata,clean_name_PPPloandata,BorrowerAddress,naics_sector_PPPloandata
0,14769,68600,LATITUDE PHARMACEUTICALS INC,9675 BUSINESSPARK AVE,54,LATITUDE PHARMACEUTICALS INC,9675 BUSINESSPARK AVE,54
1,54583,68600,ADAPTIVE LAUNCH SOLUTIONS,9975 BUSINESSPARK AVE,54,LATITUDE PHARMACEUTICALS INC,9675 BUSINESSPARK AVE,54
2,3397,114033,A N B AUTO REPAIR INC,3502 UNIVERSITY AVE,81,CA BREAKFAST REPUBLIC INC,2730 UNIVERSITY AVE,72
3,18775,89263,HABITAT RESTORATION SCIENCES INC,1217 DISTRIBUTION WAY,23,GEOGRID RETAINING WALL SYSTEMS INC,1295 DISTRIBUTION WAY,23
4,6014,89263,GEOGRID RETAINING WALL SYSTEMS INC,1295 DISTRIBUTION WAY,23,GEOGRID RETAINING WALL SYSTEMS INC,1295 DISTRIBUTION WAY,23
5,43620,49624,INTEGRITY BUILDERS GENERAL CONTRACTORS INC,866 W 18TH ST,23,INTEGRITY BUILDERS GENERAL CONTRACTORS INC,866 W 18TH ST,23


## Example of more manual versus more automatic way of concatenating address columns


In [None]:

## more manual way
ppp['address_merged_manual'] = ppp.BorrowerAddress + ", " + ppp.BorrowerCity + ", " + ppp.BorrowerState + ", " + ppp.BorrowerZip
ppp[ppp_addcols + ['address_merged_manual']].head()

### more automatic using agg and join
ppp['address_merged_auto'] = ppp[ppp_addcols].agg(', '.join, axis=1)
ppp[["address_merged_manual", "address_merged_auto"]].head()