# Fuzzy Merge Name Matching
Supporting Mode Query: https://modeanalytics.com/editor/code_for_san_francisco/reports/33204c3dccc3

In [1]:
import os
import sys

import numpy as np
import pandas as pd
# Allow up to 999 columns to be rendered when a DataFrame is displayed.
pd.set_option('display.max_columns', 999)
import sqlalchemy as sqla
from sqlalchemy import create_engine

from pyjarowinkler import distance

# The database connection URI is read from the CD_DWH environment variable.
DB_URI = os.getenv('CD_DWH')
if not DB_URI:
    # Fail fast with an actionable message instead of handing a missing
    # (None) or empty URI to create_engine().
    raise RuntimeError(
        "Environment variable CD_DWH is not set; export it with the database "
        "connection URI before running this notebook."
    )
# Shared SQLAlchemy engine used by the cells below to open connections.
engine = create_engine(DB_URI)

The Jaro-Winkler distance metric was developed for name matching in the US Census. The `pyjarowinkler` package is listed in `environment.yaml` for the conda environment; alternatively, install it with `pip install pyjarowinkler`.

The SQL Query in `all_candidate_names.sql` extracts first name and last name from each of the name columns in the two tables.

We wish to calculate the Jaro-Winkler distance for each pair of names (election name vs. donation name), separately for first names and last names.

We could limit the number of comparisons by restricting the join to pairs whose first and last names share the same initials, for example:

`...
from candidate_donations D join election_results E
on substring(D.FirstName,1,1) = substring(E.FirstName,1,1)
and substring(D.LastName,1,1) = substring(E.LastName,1,1)`

In [None]:
# Read fuzzy_merge.all_candidate_names in 10,000-row chunks and combine the
# chunks into a single DataFrame named `results`.
with engine.begin() as conn:
    chunk_iter = pd.read_sql_table(
        table_name='all_candidate_names',
        con=conn,
        schema='fuzzy_merge',
        chunksize=10000,
    )
    # The iterator must be consumed while the connection is still open.
    # ignore_index=True gives the combined frame one fresh 0..n-1 index;
    # presumably each chunk otherwise carries its own index restarting at 0,
    # which would leave duplicate row labels -- TODO confirm.
    results = pd.concat(chunk_iter, ignore_index=True)
results.head()

In [3]:
# Keep only the rows in which no column is missing.
results = results.dropna(axis=0, how='any')

In [4]:
# Number of non-null entries in each column.
results.count(axis='index')

election_fn          136514
election_ln          136514
election_name        136514
vote_total           136514
is_winner            136514
donation_fn          136514
donation_ln          136514
candidate_name       136514
total_transaction    136514
dtype: int64

In [6]:
# Element-wise wrapper around pyjarowinkler's scorer. otypes is given
# explicitly so the output dtype does not depend on the first pair scored and
# an empty frame does not make np.vectorize raise.
jaro_winkler = np.vectorize(distance.get_jaro_distance, otypes=[float])

# First names are scored against first names and last names against last
# names. (The last-name score previously compared the election *first* name
# with the donation last name.)
# NOTE(review): the outputs saved in this notebook list these columns without
# the `source_` prefix -- confirm the names against the current
# fuzzy_merge.all_candidate_names schema.
# assign() returns a new frame, so re-running this cell is idempotent.
results = results.assign(
    jwscore_fn=jaro_winkler(results['source_election_fn'], results['source_donation_fn']),
    jwscore_ln=jaro_winkler(results['source_election_ln'], results['source_donation_ln']),
)

At this point, we can write the `results` DataFrame above directly into the DB. Below, we do some additional exploration.

In [7]:
# Peek at the first five scored rows.
results.iloc[:5]

Unnamed: 0,election_fn,election_ln,election_name,vote_total,is_winner,donation_fn,donation_ln,candidate_name,total_transaction,jwscore_fn,jwscore_ln
1,kamala,harris,Kamala D. Harris,4442781.0,True,jerry,hill,"HILL, JERRY A.",666195.72,0.0,0.52
2,kamala,harris,Kamala D. Harris,4442781.0,True,georgia,acosta,"ACOSTA, GEORGIA L.",15750.0,0.44,0.56
3,kamala,harris,Kamala D. Harris,4442781.0,True,abel,guillen,"GUILLEN, ABEL",13650.0,0.61,0.44
4,kamala,harris,Kamala D. Harris,4442781.0,True,george,runner,"RUNNER, GEORGE C.",249.0,0.0,0.56
5,kamala,harris,Kamala D. Harris,4442781.0,True,steve,hill,"HILL, STEVE",9700.0,0.0,0.52


In [8]:
# Rows whose first-name score is strictly greater than 0.9.
results.loc[results['jwscore_fn'].gt(0.9)]

Unnamed: 0,election_fn,election_ln,election_name,vote_total,is_winner,donation_fn,donation_ln,candidate_name,total_transaction,jwscore_fn,jwscore_ln
404,steve,cooley,Steve Cooley,4368624.0,False,steve,hill,"HILL, STEVE",9700.00,1.00,0.47
414,steve,cooley,Steve Cooley,4368624.0,False,steve,tye,"TYE, STEVE",154183.35,1.00,0.00
548,steve,cooley,Steve Cooley,4368624.0,False,steven,choi,"CHOI, STEVEN",463763.43,0.97,0.65
578,steve,cooley,Steve Cooley,4368624.0,False,steve,neal,"NEAL, STEVE",4186.27,1.00,0.47
603,steve,cooley,Steve Cooley,4368624.0,False,steven,davis,"DAVIS, STEVEN",2986.42,0.97,0.00
610,steve,cooley,Steve Cooley,4368624.0,False,steven,bailey,"BAILEY, STEVEN",7652.00,0.97,0.67
635,steve,cooley,Steve Cooley,4368624.0,False,steven,glazer,"GLAZER, STEVEN",2016506.04,0.97,0.56
652,steve,cooley,Steve Cooley,4368624.0,False,steve,fazio,"FAZIO, STEVE",1351208.79,1.00,0.46
694,steve,cooley,Steve Cooley,4368624.0,False,steven,bradford,"BRADFORD, STEVEN",955297.40,0.97,0.43
700,steve,cooley,Steve Cooley,4368624.0,False,steve,fox,"FOX, STEVE",162738.62,1.00,0.50


Though more investigation could be conducted, let's keep the pairs that score at least 0.9 on both first name and last name.

In [9]:
# Pairs scoring at least 0.9 on both the first-name and last-name comparison.
match_90 = results.query('jwscore_fn >= 0.9 and jwscore_ln >= 0.9')

Then we can explore which non-perfect matches are included and evaluate whether we agree with them.

In [10]:
# Of the >= 0.9 matches, show up to 20 where at least one score is below 1.
match_90.loc[match_90['jwscore_fn'].lt(1) | match_90['jwscore_ln'].lt(1)].head(20)

Unnamed: 0,election_fn,election_ln,election_name,vote_total,is_winner,donation_fn,donation_ln,candidate_name,total_transaction,jwscore_fn,jwscore_ln
3785,"edmund ""jerry""",brown,"Edmund G. ""Jerry"" Brown*",4388368.0,False,edmund (jerry),brown,"BROWN, EDMUND G. (JERRY)",2700.0,0.94,1.0
11837,"veronica ""roni""",jacobi,"Veronica ""Roni"" Jacobi",65355.0,False,veronica,jacobi,"JACOBI, VERONICA S.",18115.0,0.91,1.0
30670,rudy,salas,Rudy Salas,53056.0,True,rudy,salas jr.,"SALAS JR., RUDY",1181293.78,1.0,0.91
32140,vince,fong,Vince Fong,123959.0,True,vincent,fong,"FONG, VINCENT",752160.44,0.94,1.0
34628,s. monique,limón,S. Monique Limón,128344.0,True,s. monique,limon,"LIMON, S. MONIQUE",605884.0,1.0,0.91
37203,charlie,schaupp,Charlie Schaupp,68170.0,False,charles,schaupp,"SCHAUPP, CHARLES E.",61275.25,0.94,1.0
41766,matt,dababneh,Matt Dababneh,111148.0,True,matthew,dababneh,"DABABNEH, MATTHEW",1432829.63,0.91,1.0
43226,eloise,reyes,Eloise Reyes,62432.0,True,eloise gomez,reyes,"REYES, ELOISE GOMEZ",708579.31,0.9,1.0
46707,matthew gene,craffey,Matthew Gene Craffey,54016.0,False,matthew,craffey,"CRAFFEY, MATTHEW G.",9600.0,0.92,1.0
53222,reginald byron,jones-sawyer,Reginald Byron Jones-Sawyer,77324.0,True,reginald byron,jones-sawyer sr.,"JONES-SAWYER SR., REGINALD BYRON",842523.87,1.0,0.95


This list highlights some successes with these thresholds but also highlights some areas that could be investigated further (e.g. names in brackets or quotes)

In [11]:
# Number of pairs retained at the 0.9 threshold.
match_90.shape[0]

175

In [12]:
# How many pairs score strictly above 0.95 on both names.
len(results.query('jwscore_fn > 0.95 and jwscore_ln > 0.95'))

159

In [13]:
# How many pairs score exactly 1 on both first and last name.
len(results.loc[results['jwscore_fn'].eq(1) & results['jwscore_ln'].eq(1)])

155

In [14]:
# How many pairs score strictly above 0.85 on both names.
results.query('jwscore_fn > 0.85 and jwscore_ln > 0.85').shape[0]

184

In [15]:
# Pairs scoring strictly above 0.85 on both the first-name and last-name comparison.
match_85 = results.loc[results['jwscore_fn'].gt(0.85) & results['jwscore_ln'].gt(0.85)]

In [16]:
# Of the > 0.85 matches, show up to 20 where at least one score is below 1.
match_85.query('jwscore_fn < 1 or jwscore_ln < 1').head(20)

Unnamed: 0,election_fn,election_ln,election_name,vote_total,is_winner,donation_fn,donation_ln,candidate_name,total_transaction,jwscore_fn,jwscore_ln
3785,"edmund ""jerry""",brown,"Edmund G. ""Jerry"" Brown*",4388368.0,False,edmund (jerry),brown,"BROWN, EDMUND G. (JERRY)",2700.0,0.94,1.0
11837,"veronica ""roni""",jacobi,"Veronica ""Roni"" Jacobi",65355.0,False,veronica,jacobi,"JACOBI, VERONICA S.",18115.0,0.91,1.0
13724,susan talamantes,eggman,Susan Talamantes Eggman,86315.0,True,susan,eggman,"EGGMAN, SUSAN",721363.71,0.86,1.0
18809,"carlos ""chuck""",taylor,"Carlos ""Chuck"" Taylor",37180.0,False,carlos,taylor,"TAYLOR, CARLOS",6450.0,0.89,1.0
20239,bill,quirk,Bill Quirk,114001.0,True,bill (william),quirk,"QUIRK, BILL (WILLIAM J.)",697958.72,0.86,1.0
30670,rudy,salas,Rudy Salas,53056.0,True,rudy,salas jr.,"SALAS JR., RUDY",1181293.78,1.0,0.91
32140,vince,fong,Vince Fong,123959.0,True,vincent,fong,"FONG, VINCENT",752160.44,0.94,1.0
34628,s. monique,limón,S. Monique Limón,128344.0,True,s. monique,limon,"LIMON, S. MONIQUE",605884.0,1.0,0.91
37203,charlie,schaupp,Charlie Schaupp,68170.0,False,charles,schaupp,"SCHAUPP, CHARLES E.",61275.25,0.94,1.0
41766,matt,dababneh,Matt Dababneh,111148.0,True,matthew,dababneh,"DABABNEH, MATTHEW",1432829.63,0.91,1.0


In [17]:
# Mixed thresholds: first-name score above 0.85, last-name score above 0.9.
match_85_90 = results.query('jwscore_fn > 0.85 and jwscore_ln > 0.9')

In [18]:
# Of the mixed-threshold matches, show up to 20 where at least one score is below 1.
match_85_90.loc[match_85_90['jwscore_fn'].lt(1) | match_85_90['jwscore_ln'].lt(1)].iloc[:20]

Unnamed: 0,election_fn,election_ln,election_name,vote_total,is_winner,donation_fn,donation_ln,candidate_name,total_transaction,jwscore_fn,jwscore_ln
3785,"edmund ""jerry""",brown,"Edmund G. ""Jerry"" Brown*",4388368.0,False,edmund (jerry),brown,"BROWN, EDMUND G. (JERRY)",2700.0,0.94,1.0
11837,"veronica ""roni""",jacobi,"Veronica ""Roni"" Jacobi",65355.0,False,veronica,jacobi,"JACOBI, VERONICA S.",18115.0,0.91,1.0
13724,susan talamantes,eggman,Susan Talamantes Eggman,86315.0,True,susan,eggman,"EGGMAN, SUSAN",721363.71,0.86,1.0
18809,"carlos ""chuck""",taylor,"Carlos ""Chuck"" Taylor",37180.0,False,carlos,taylor,"TAYLOR, CARLOS",6450.0,0.89,1.0
20239,bill,quirk,Bill Quirk,114001.0,True,bill (william),quirk,"QUIRK, BILL (WILLIAM J.)",697958.72,0.86,1.0
30670,rudy,salas,Rudy Salas,53056.0,True,rudy,salas jr.,"SALAS JR., RUDY",1181293.78,1.0,0.91
32140,vince,fong,Vince Fong,123959.0,True,vincent,fong,"FONG, VINCENT",752160.44,0.94,1.0
34628,s. monique,limón,S. Monique Limón,128344.0,True,s. monique,limon,"LIMON, S. MONIQUE",605884.0,1.0,0.91
37203,charlie,schaupp,Charlie Schaupp,68170.0,False,charles,schaupp,"SCHAUPP, CHARLES E.",61275.25,0.94,1.0
41766,matt,dababneh,Matt Dababneh,111148.0,True,matthew,dababneh,"DABABNEH, MATTHEW",1432829.63,0.91,1.0


In [19]:
# Number of pairs retained with the mixed 0.85 / 0.9 thresholds.
match_85_90.shape[0]

183