In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sparse_dot_topn import awesome_cossim_topn 
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from deduping_module import deduping_class

In [2]:
# Project folder that holds the raw exports (machine-specific; adjust when running elsewhere).
link = r'C:\Users\USER\Documents\LM_project\SAL-230_Revisit validity'
# Every backslash in the relative part is escaped: '\s' and '\p' are not valid escape
# sequences and newer Python versions warn about them. The resulting paths are unchanged.
gt = pd.read_csv(link + '\\raw\\sf_export.csv', encoding='latin-1')  # ground truth (Salesforce export)
nm = pd.read_csv(link + '\\raw\\pennsylvania-00001.csv', encoding='latin-1')  # records to be matched

In [3]:
# Prepare the ground truth in one pass:
#   1. keep only rows that have a Salesforce Contact Id;
#   2. keep one row per Salesforce Account Id (needed when deduping on the account object);
#   3. rebuild a gap-free 0..n-1 index, because matches are looked up by row number and
#      an index that is not in numerical order causes errors when fetching the matches.
gt = (
    gt
    .dropna(subset=['Salesforce Contact Id'])
    .drop_duplicates(subset=['Salesforce Account Id'])
    .reset_index(drop=True)
)
gt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1029533 entries, 0 to 1029532
Data columns (total 11 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Salesforce Account Id  1029533 non-null  object 
 1   Salesforce Contact Id  1029533 non-null  object 
 2   Account Name           1029532 non-null  object 
 3   First Name             1028281 non-null  object 
 4   Last Name              1029506 non-null  object 
 5   Email                  614566 non-null   object 
 6   Phone                  1001523 non-null  object 
 7   Zip Code 1             805570 non-null   object 
 8   County 1               785468 non-null   object 
 9   State 1                1005552 non-null  object 
 10  Quarantine             1029533 non-null  float64
dtypes: float64(1), object(10)
memory usage: 86.4+ MB


Note:
1. the footer on the ground truth file should always be dropped

In [4]:
# Create the deduping helper from the cleaned ground-truth frame; every later step
# (key selection, n-gram setting, vectorizing) is called on this object.
deduping = deduping_class(gt) 

In [5]:
# Preview the ground-truth frame as it is stored on the deduping object.
deduping.ground_truth.head()

Unnamed: 0,Salesforce Account Id,Salesforce Contact Id,Account Name,First Name,Last Name,Email,Phone,Zip Code 1,County 1,State 1,Quarantine
0,0014y00002LvzGCAAZ,0034y00002QqcZtAAJ,Melcher Law,Doug,Melcher,,(617) 485-0859,,,,0.0
1,0014y00002aVU9TAAW,0034y00002ZjWOiAAN,Kevin Barnett,Kevin,Barnett,kb@barnettfalls.com,,,,NC,0.0
2,0014y00002aVjVkAAK,0034y00002ZjlorAAB,Kellie Jenkins Hohenshelt,Kellie Jenkins,Hohenshelt,,(214) 587-7044,,,TX,0.0
3,0014y00002aVbeeAAC,0034y00002ZjdxQAAR,Jackalynne Fletcher,Jackalynne,Fletcher,,(614) 487-8283,,,OH,0.0
4,0014y00002aVn5XAAS,0034y00002ZjpOtAAJ,Timothy Blair,Timothy,Blair,,(713) 818-8474,,,TX,0.0


#### Step 1:
1. Make sure the column names of the dataframe to be matched are the same as the column names of the ground truth.

In [6]:
# Give the columns of the frame to be matched the same names as the ground-truth columns.
nm.rename(
    columns={
        'firstName': 'First Name',
        'lastName': 'Last Name',
        'email': 'Email',
        'phone': 'Phone',
        'zip': 'Zip Code 1',
    },
    inplace=True,
)

In [7]:
# Check that the renamed columns now appear in the frame to be matched.
nm.head(3)

Unnamed: 0,id,title,Last Name,First Name,suffix,status,city,state,county,country,district,Phone,otherPhone,faxNumber,Email,employer,street,middleInitial,admissionDate,Zip Code 1
0,7,,Baird,Robert,,Deceased,CLAIRTON,PENNSYLVANIA,ALLEGHENY,UNITED STATES,District 4,(412) 233-6260,,,,,441 MITCHELL AVE,D,9/29/1958,15025
1,24,,Auld,Howard,,Deceased,ALLISON PARK,PENNSYLVANIA,ALLEGHENY,UNITED STATES,District 4,(412) 486-2300,,,,,2589 DUNCAN AVENUE,S,3/25/1946,15101
2,36,,Banyasz,William,Jr.,Deceased,BONITA SPRINGS,FLORIDA,OUT OF STATE,UNITED STATES,Out of State,(239) 498-4344,,,,,26000 HICKORY BOULEVARD #407,R,3/23/1970,34134


#### Step 2
Now that the columns are matched, we can use the `key_selector` function. \
It takes two kinds of input: `*args` (the names of the columns that make up the key) and `data=` (the dataframe to be matched). \
It returns the dataframe to be matched with a `primary_key` column added, and it also adds the same key to the ground truth, but only in the backend.

In [8]:
# Build the matching key from first name, last name and phone. The returned frame carries
# a `primary_key` column (e.g. 'StephenHodzic(412) 429-9520' in the row inspected later).
nm_primarykey = deduping.key_selector('First Name', 'Last Name', 'Phone', data= nm)

#### Optional Step
We can set an optional parameter called ngrams: the size of the n-grams that the text is split into. \
If it is not set, it defaults to 3.

In [9]:
# Use n-grams of size 4 instead of the default of 3.
deduping.set_ngrams(4)

#### Step 3
This function converts the ground truth and the dataframe to be matched into TF-IDF sparse matrices. \
The input to this function should be the dataframe that has the primary key. \
The results are stored as attributes of the class instance:

self.nm_tfidf\
self.gt_tfidf

In [10]:
# Vectorize the keyed frame together with the ground truth; the resulting matrices are
# read back below as deduping.nm_tfidf and deduping.gt_tfidf.
deduping.vectorizer(nm_primarykey)

In [11]:
# Cosine similarity of every row to be matched against every ground-truth row, as a sparse matrix.
# NOTE(review): the positional 10 and 0.8 are presumably ntop (matches kept per row) and
# lower_bound (minimum similarity) — confirm against the installed sparse_dot_topn signature.
matches = awesome_cossim_topn(deduping.nm_tfidf, deduping.gt_tfidf.transpose(), 10, 0.8, use_threads=True, n_jobs=6)

In [28]:
def get_matches_df(sparse_matrix, name_vector, top=100, left_df=None, right_df=None):
    """Turn a sparse similarity matrix into a table of matched pairs.

    Parameters
    ----------
    sparse_matrix : scipy.sparse matrix
        Similarity scores. Row ``i`` is looked up as index label ``i`` in
        ``left_df`` and column ``j`` as index label ``j`` in ``right_df``, so
        both frames need a gap-free 0..n-1 index.
    name_vector : any
        Not used. Kept so that existing calls keep working.
    top : int or None, default 100
        Maximum number of matches to return, taken in the order in which the
        matrix stores them. ``None``, or a value that is not smaller than the
        number of non-zero entries, returns every match.
    left_df : pandas.DataFrame, optional
        Frame with a ``primary_key`` column for the matrix rows. Defaults to
        the notebook-level ``nm``.
    right_df : pandas.DataFrame, optional
        Frame with ``primary_key`` and ``Salesforce Contact Id`` columns for
        the matrix columns. Defaults to the notebook-level ``gt``.

    Returns
    -------
    pandas.DataFrame
        One row per match with the columns ``index`` (row label in
        ``left_df``), ``left_side``, ``right_side``, ``Contact ID`` and
        ``similairity`` (column name spelled as downstream cells expect it).

    Raises
    ------
    ValueError
        If ``top`` is negative.
    KeyError
        If a matrix row/column number is not an index label of the
        corresponding frame.
    """
    # Resolve the defaults at call time so the notebook globals are only
    # needed when the caller does not pass the frames explicitly.
    if left_df is None:
        left_df = nm
    if right_df is None:
        right_df = gt

    if top is not None and top < 0:
        raise ValueError("top must be a non-negative integer or None")

    # Take rows, columns and scores from the same COO view and drop explicit
    # zeros from all three together, so that a score can never be paired with
    # the wrong (row, column) position.
    coo = sparse_matrix.tocoo()
    is_nonzero = coo.data != 0
    rows = coo.row[is_nonzero]
    cols = coo.col[is_nonzero]
    scores = coo.data[is_nonzero]

    # `top is not None` (rather than the former bitwise `top & top`) lets
    # top=None mean "no limit" instead of raising a TypeError.
    if top is not None and top < cols.size:
        nr_matches = top
    else:
        print("The top value is not set or the value exceeds the nonzero size")
        nr_matches = cols.size

    rows = rows[:nr_matches]
    cols = cols[:nr_matches]
    scores = scores[:nr_matches]

    # Label-based lookups for all matches at once instead of one .loc call per match.
    return pd.DataFrame({'index': rows.astype(object),
                         'left_side': left_df.loc[rows, 'primary_key'].to_numpy(dtype=object),
                         'right_side': right_df.loc[cols, 'primary_key'].to_numpy(dtype=object),
                         'Contact ID': right_df.loc[cols, 'Salesforce Contact Id'].to_numpy(dtype=object),
                         'similairity': scores.astype(float)})

In [29]:
# Build the match table (at most 100000 rows) and keep only the near matches:
# pairs with a similarity of 0.99999 or more are treated as exact matches and dropped.
matches_df = (
    get_matches_df(matches, deduping.combined_list, top=100000)
    .loc[lambda frame: frame['similairity'] < 0.99999]
)


The top value is not set or the value exceeds the nonzero size


In [15]:
# Show the remaining candidate matches (exact matches were filtered out in the previous cell).
matches_df

Unnamed: 0,index,left_side,right_side,Contact ID,similairity
1,16,CharlesAlbright(412) 531-4732,CharlesAlbright Jr.(412) 531-4732,0036000001T2kjtAAB,0.816244
10,44,EdwardBrownnan,RichardBrownnan,0034y00002ZjVGvAAN,0.837507
11,44,EdwardBrownnan,RichardBrownnan,0034y00002ZjVF9AAN,0.837507
22,85,KennethBehrend(412) 391-2515,KennethBehrend(412) 391-2503,0033000000HYk7zAAD,0.899059
30,116,CarlBrueck(407) 889-0822,CarlBrueck Jr.(407) 889-0822,0036000001RhZS1AAN,0.816222
...,...,...,...,...,...
22204,125529,StephenHodzic(412) 429-9520,ZanHodzic(412) 429-9520,0033000000HWmwjAAD,0.835147
22206,125597,ThomasHancock(304) 526-3500,ThomasHancock(304) 526-3515,0034y00002ZjbyvAAB,0.893742
22218,125843,MichaelWhalen(949) 833-1703,MichaelWhalen(949) 833-1700,0030e00002NciW7AAJ,0.941175
22220,125876,CarlMeyer(717) 620-1165,CarlMeyer Jr.(717) 620-1165,0034y00002ZjfgQAAR,0.855141


#### Notes:
1. If deduping at the account object, always remember to remove the duplicates in the Salesforce Account Id.
1. Always make sure that the index of each input is in numerical order (0, 1, 2, ...); otherwise getting the matches will fail.
1. If a selected key value is missing, it has a significant impact on matching performance (for example, a missing phone shows up as the text `nan` in the key, as in `EdwardBrownnan`).

In [31]:
# Spot-check one matched record: 125529 is a value from the `index` column of matches_df.
nm.loc[125529]

id                                      330820
title                                      NaN
Last Name                               Hodzic
First Name                             Stephen
suffix                                     NaN
status                                  Active
city                                  CARNEGIE
state                             PENNSYLVANIA
county                               ALLEGHENY
country                          UNITED STATES
district                            District 4
Phone                           (412) 429-9520
otherPhone                                 NaN
faxNumber                                  NaN
Email                                      NaN
employer         Hodzic and Porach Law Offices
street             1100 WASHINGTON AVE STE 209
middleInitial                                P
admissionDate                       10/20/2021
Zip Code 1                               15106
primary_key        StephenHodzic(412) 429-9520
Name: 125529,