In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sparse_dot_topn import awesome_cossim_topn 
import re

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
gt = pd.read_csv('raw/sf_export.csv')
gt.dropna(subset=['Salesforce Contact Id'], inplace= True)
gt.drop_duplicates(subset=['Salesforce Account Id'], inplace= True)
gt['Key Value'] = gt.apply(lambda x: (str(x['Account Name'])+ ' ' + str(x['State 1'])).lower(), axis= 1)
gt.reset_index(drop=True, inplace=True)

In [4]:
nm = pd.read_csv('raw/pennsylvania-00001.csv')

In [5]:
nm['Key Value'] = nm.apply(lambda x: (str(x['firstName']) + ' ' + str(x['lastName']) + ' ' + str('PA')).lower(), axis= 1)

In [6]:
import re

def ngrams(string, n=3):
    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
combined_list = nm['Key Value'].tolist() + gt['Key Value'].tolist()
vectorizer.fit(nm['Key Value'].tolist() + gt['Key Value'].tolist())

nm_tfidf = nm['Key Value'].tolist()
nm_tfidf = vectorizer.transform(nm_tfidf)

gt_tfidf = gt['Key Value']
gt_tfidf = vectorizer.transform(gt_tfidf)

In [8]:
nm_tfidf.shape

(126060, 14956)

In [9]:
#gt meaning ground truth or reference table

gt_tfidf.shape

(1029533, 14956)

In [26]:
126060 * 1029533

129782929980

In [10]:
matches = awesome_cossim_topn(nm_tfidf, gt_tfidf.transpose(), 10, 0.8, use_threads=True, n_jobs=6)

In [17]:
def get_matches_df(sparse_matrix, name_vector, top=100):
    non_zeros = sparse_matrix.nonzero()
    
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    
    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size

    left_side = np.empty([nr_matches], dtype=object)
    index_value = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    contact_id = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)
    
    for index in range(0, nr_matches):
        left_side[index] = nm.loc[sparserows[index], 'Key Value']
        index_value[index] = sparserows[index]
        right_side[index] = gt.loc[sparsecols[index], 'Key Value']
        contact_id[index] = gt.loc[sparsecols[index], 'Salesforce Contact Id']
        similairity[index] = sparse_matrix.data[index]
    
    return pd.DataFrame({'index':index_value,
                          'left_side': left_side,
                          'right_side': right_side,
                          'Contact ID':contact_id, 
                           'similairity': similairity})

In [18]:
matches_df = get_matches_df(matches, combined_list, top=100000)
matches_df = matches_df[matches_df['similairity'] < 0.99999] # Remove all exact matches
# matches_df.sample(10)


In [20]:
matches_df.to_csv('result.csv', index=False)

In [25]:
matches_df.sort_values(by=['similairity'], ascending= False)

Unnamed: 0,index,left_side,right_side,Contact ID,similairity
21316,13079,nydia diaz-buxo pa,nydia diaz-buxo pr,0033000000HYn2IAAT,0.981257
11573,7342,marlene bell pa,arlene bell pa,0036000001T2jTGAAZ,0.974377
36675,23547,mehrdad barikbin pa,mehrdad barikbin ca,0036000001SNpHOAA1,0.973979
95278,60215,francisco troncoso pa,francisco troncoso pr,0033000000HXjUlAAL,0.972968
11142,7055,lizabeth brown pa,elizabeth brown pa,0034y00002ZjdKFAAZ,0.972474
...,...,...,...,...,...
70221,47341,allen levin pa,allen levine ny,0033000000HYmvxAAD,0.800007
86816,56077,allen levin pa,allen levine ny,0033000000HYmvxAAD,0.800007
18115,11445,william cherry pa,william cherry jr ca,0036000001TVaL2AAL,0.800005
78375,51513,james wallace pa,james wallace jr. tx,0036000001SOEtLAAX,0.800004
