## Cosine Similarity

#### Grabbing Sample Data From Record Linkage Module

In [206]:
import recordlinkage
from recordlinkage.datasets import load_febrl4
dfA, dfB = load_febrl4()

In [207]:
dfA

Unnamed: 0_level_0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
rec-1070-org,michaela,neumann,8,stanley street,miami,winston hills,4223,nsw,19151111,5304218
rec-1016-org,courtney,painter,12,pinkerton circuit,bega flats,richlands,4560,vic,19161214,4066625
rec-4405-org,charles,green,38,salkauskas crescent,kela,dapto,4566,nsw,19480930,4365168
rec-1288-org,vanessa,parr,905,macquoid place,broadbridge manor,south grafton,2135,sa,19951119,9239102
rec-3585-org,mikayla,malloney,37,randwick road,avalind,hoppers crossing,4552,vic,19860208,7207688
...,...,...,...,...,...,...,...,...,...,...
rec-2153-org,annabel,grierson,97,mclachlan crescent,lantana lodge,broome,2480,nsw,19840224,7676186
rec-1604-org,sienna,musolino,22,smeaton circuit,pangani,mckinnon,2700,nsw,19890525,4971506
rec-1003-org,bradley,matthews,2,jondol place,horseshoe ck,jacobs well,7018,sa,19481122,8927667
rec-4883-org,brodee,egan,88,axon street,greenslopes,wamberal,2067,qld,19121113,6039042


In [208]:
dfB

Unnamed: 0_level_0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
rec-561-dup-0,elton,,3,light setreet,pinehill,windermere,3212,vic,19651013,1551941
rec-2642-dup-0,mitchell,maxon,47,edkins street,lochaoair,north ryde,3355,nsw,19390212,8859999
rec-608-dup-0,,white,72,lambrigg street,kelgoola,broadbeach waters,3159,vic,19620216,9731855
rec-3239-dup-0,elk i,menzies,1,lyster place,,northwood,2585,vic,19980624,4970481
rec-2886-dup-0,,garanggar,,may maxwell crescent,springettst arcade,forest hill,2342,vic,19921016,1366884
...,...,...,...,...,...,...,...,...,...,...
rec-4495-dup-0,connor,belperio,15,,,ryde,2570,nsw,19170518,5394641
rec-4211-dup-0,daniel,maspn,9,derrington crescent,el pedro caravan park,sunnybank,4350,vic,19500705,5525378
rec-3131-dup-0,samuel,crofs,613,banjine street,kurrajong vlge,pengzin,2230,qld,19410531,4467228
rec-3815-dup-0,saah,beattih,60,kay's place,oldershaw court,ashfield,2047,vic,19500712,9435148


#### Clean Data by:
0. Replace NaN with 'nan' strings
1. Put together full name 
2. Put together the full street address
3. Create n_grams of size 5 for all 
4. Remove special characters
5. Make all letters lowercase

Note: There may be more cleaning that needs to happen in the data you are dealing with, but for this case, this is all that needs to happen. 

In [209]:
def clean_data(df):
    """
    creates a new column with the given_name and surname
    and addresses as well
    returns the dataframe
    """
    df = df.fillna('nan')
    df['address'] = df['street_number'] + ' ' + df['address_1'] + ' ' + df['address_2'] + \
                    ' ' + df['state'] + ' ' + df['postcode']
    df['name'] = df['given_name'] + ' ' + df['surname']
    df.reset_index(inplace=True)
    return df        

In [210]:
def ngrams(string, n = 5):
    """
    creates ngrams that separates a string into
    sizes of 5 (or whatever you choose for n)
    makes it lowercase
    replaces non-useful characters
    returns a list of the ngrams
    """
    print('running ngrams function')
    string = fix_text(string)
    string = string.encode("ascii", errors = "ignore").decode()
    string = string.lower()
    chars_to_remove = ["(",")",".","|","[","]","{","}","'"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    string = re.sub(rx,'',string)
    string = string.replace('&','and')
    string = string.replace(',',' ')
    string = string.replace('-',' ')
    string = string.title()
    string = re.sub(' +', ' ',string).strip()
    string = ' ' + string + ' '
    string = re.sub(r'[,-./]|\sBD',r'',string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

In [211]:
def awesome_cossim_top(A, B, ntop, lower_bound=0):
    """
    force A and B as a CSR matrix.
    If they have already been CSR, there is no overhead
    """
    print('Running awesome_cossim_top function')
    A = A.tocsr()
    B = B.tocsr()
    #----------------------------------
    M, _ = A.shape
    _, N = B.shape
    #----------------------------------
    print('creating the cosine similarity matrix')
    idx_dtype = np.int32
    nnz_max = M*ntop
    indptr = np.zeros(M+1, dtype=idx_dtype)
    print(f'here is indptr, nnz_max and idx_dtype {indptr}, {nnz_max}, {idx_dtype}')
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)
    print(f'here are indices and data {indices} and {data}')
    #----------------------------------
    print(f'here is A.indptr {A.indptr}')
    print(f'here is A.indices {A.indices}')
    print(f'here is A.data {A.data}')
    print(f'here is B.indptr {B.indptr}')
    print(f'here is B.indices {B.indices}')
    print(f'here is B.data {B.data}')
    print(f'here is ntop {ntop}')
    print(f'here is lower_bound {lower_bound}')
    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)
    print('at the end of awesome_cossim_top')
    return csr_matrix((data,indices,indptr),shape=(M,N))

In [212]:
def get_matches_df(sparse_matrix, A, B, df, df2, top=1):
    print('in the get_matches_df function')
    non_zeros = sparse_matrix.nonzero()
    #----------------------------------
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    #----------------------------------
    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size
    print('Creating columns for dataframe')
    #----------------------------------
    nameA = np.empty([nr_matches], dtype=object)
    nameB = np.empty([nr_matches], dtype=object)
    addressA = np.empty([nr_matches], dtype=object)
    addressB = np.empty([nr_matches], dtype=object)
    dobA = np.empty([nr_matches], dtype=object)
    dobB = np.empty([nr_matches], dtype=object)
    ssnA = np.empty([nr_matches], dtype=object)
    ssnB = np.empty([nr_matches], dtype=object)
    rec_idA = np.empty([nr_matches], dtype=object)
    rec_idB = np.empty([nr_matches], dtype=object)
    similarity = np.zeros(nr_matches)
    #----------------------------------
    print('before the for loop')
    for index in range(0, nr_matches):
        # A and df
        nameA[index]  = A[sparserows[index]]
        addressA[index] = df.iloc[sparserows[index]].loc['address']
        dobA[index]     = df.iloc[sparserows[index]].loc['date_of_birth']
        ssnA[index]     = df.iloc[sparserows[index]].loc['soc_sec_id']
        rec_idA[index] = df.iloc[sparserows[index]].loc['rec_id']
        # B and df2
        nameB[index]    = B[sparserows[index]]
        addressB[index] = df2.iloc[sparserows[index]].loc['address']
        dobB[index]       = df2.iloc[sparserows[index]].loc['date_of_birth']
        ssnB[index]       = df2.iloc[sparserows[index]].loc['soc_sec_id']
        rec_idB[index]   = df2.iloc[sparserows[index]].loc['rec_id']
        # similarity calculation
        similarity[index] = sparse_matrix.data[index]
        #------------------------------
        df3 = pd.DataFrame({'rec_idA': rec_idA,
                            'rec_idB': rec_idB,
                            'date_of_birthA':dobA,
                            'date_of_birthB':dobB,
                            'addressA':addressA,
                            'addressB':addressB,
                            'soc_sec_idA':ssnA,
                            'soc_sec_idB':ssnB,
                            'nameA':nameA,
                            'nameB':nameB,
                            'similarity': similarity})
    return df3

In [213]:
dfA = clean_data(dfA.copy())
dfB = clean_data(dfB.copy())

In [214]:
dfA

Unnamed: 0,rec_id,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id,address,name
0,rec-1070-org,michaela,neumann,8,stanley street,miami,winston hills,4223,nsw,19151111,5304218,8 stanley street miami nsw 4223,michaela neumann
1,rec-1016-org,courtney,painter,12,pinkerton circuit,bega flats,richlands,4560,vic,19161214,4066625,12 pinkerton circuit bega flats vic 4560,courtney painter
2,rec-4405-org,charles,green,38,salkauskas crescent,kela,dapto,4566,nsw,19480930,4365168,38 salkauskas crescent kela nsw 4566,charles green
3,rec-1288-org,vanessa,parr,905,macquoid place,broadbridge manor,south grafton,2135,sa,19951119,9239102,905 macquoid place broadbridge manor sa 2135,vanessa parr
4,rec-3585-org,mikayla,malloney,37,randwick road,avalind,hoppers crossing,4552,vic,19860208,7207688,37 randwick road avalind vic 4552,mikayla malloney
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,rec-2153-org,annabel,grierson,97,mclachlan crescent,lantana lodge,broome,2480,nsw,19840224,7676186,97 mclachlan crescent lantana lodge nsw 2480,annabel grierson
4996,rec-1604-org,sienna,musolino,22,smeaton circuit,pangani,mckinnon,2700,nsw,19890525,4971506,22 smeaton circuit pangani nsw 2700,sienna musolino
4997,rec-1003-org,bradley,matthews,2,jondol place,horseshoe ck,jacobs well,7018,sa,19481122,8927667,2 jondol place horseshoe ck sa 7018,bradley matthews
4998,rec-4883-org,brodee,egan,88,axon street,greenslopes,wamberal,2067,qld,19121113,6039042,88 axon street greenslopes qld 2067,brodee egan


In [215]:
dfB

Unnamed: 0,rec_id,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id,address,name
0,rec-561-dup-0,elton,,3,light setreet,pinehill,windermere,3212,vic,19651013,1551941,3 light setreet pinehill vic 3212,elton nan
1,rec-2642-dup-0,mitchell,maxon,47,edkins street,lochaoair,north ryde,3355,nsw,19390212,8859999,47 edkins street lochaoair nsw 3355,mitchell maxon
2,rec-608-dup-0,,white,72,lambrigg street,kelgoola,broadbeach waters,3159,vic,19620216,9731855,72 lambrigg street kelgoola vic 3159,nan white
3,rec-3239-dup-0,elk i,menzies,1,lyster place,,northwood,2585,vic,19980624,4970481,1 lyster place nan vic 2585,elk i menzies
4,rec-2886-dup-0,,garanggar,,may maxwell crescent,springettst arcade,forest hill,2342,vic,19921016,1366884,nan may maxwell crescent springettst arcade vi...,nan garanggar
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,rec-4495-dup-0,connor,belperio,15,,,ryde,2570,nsw,19170518,5394641,15 nan nan nsw 2570,connor belperio
4996,rec-4211-dup-0,daniel,maspn,9,derrington crescent,el pedro caravan park,sunnybank,4350,vic,19500705,5525378,9 derrington crescent el pedro caravan park vi...,daniel maspn
4997,rec-3131-dup-0,samuel,crofs,613,banjine street,kurrajong vlge,pengzin,2230,qld,19410531,4467228,613 banjine street kurrajong vlge qld 2230,samuel crofs
4998,rec-3815-dup-0,saah,beattih,60,kay's place,oldershaw court,ashfield,2047,vic,19500712,9435148,60 kay's place oldershaw court vic 2047,saah beattih


In [216]:
namesA = dfA['name'].to_numpy()
namesB = dfB['name'].to_numpy()

In [217]:
from sklearn.feature_extraction.text import TfidfVectorizer
import sparse_dot_topn.sparse_dot_topn as ct
from scipy.sparse import csr_matrix
from ftfy import fix_text
import pandas as pd
import numpy as np
import re

In [218]:
vectorizer = TfidfVectorizer(min_df = 1, analyzer = ngrams)
tf_idf_namesA = vectorizer.fit_transform(namesA)
tf_idf_namesB = vectorizer.transform(namesB)

running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams f

running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams f

running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams f

running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams f

running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams f

running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams f

running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams f

running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams f

running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams f

running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams f

running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams f

running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams f

running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams f

running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams f

running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams f

running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams f

running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams f

running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams f

running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams function
running ngrams f

In [None]:
matches = awesome_cossim_top(tf_idf_namesA, tf_idf_namesB.transpose(), 10, 0)
top_values = len(matches.data)
print(f'size of top_values {top_values}')
matches_df = get_matches_df(matches, namesA, namesB, top = top_values, df = dfA, df2 = dfB)

Running awesome_cossim_top function
creating the cosine similarity matrix
here is indptr, nnz_max and idx_dtype [0 0 0 ... 0 0 0], 50000, <class 'numpy.int32'>
here are indices and data [0 0 0 ... 0 0 0] and [0. 0. 0. ... 0. 0. 0.]
here is A.indptr [    0    14    28 ... 56716 56725 56738]
here is A.indices [12620 17196  9022 ...  1059 11605 12086]
here is A.data [0.23212043 0.27628192 0.28072735 ... 0.27474423 0.25212359 0.23234254]
here is B.indptr [    0     1     2 ... 47392 47393 47397]
here is B.indices [3997 2848  184 ... 1041 1794 2171]
here is B.data [0.30067206 0.32743095 0.28413629 ... 0.26219089 0.36482361 0.2985141 ]
here is ntop 10
here is lower_bound 0
at the end of awesome_cossim_top
size of top_values 48886
in the get_matches_df function
Creating columns for dataframe
before the for loop


In [None]:
matches_df

In [None]:
matches_df[matches_df['similarity'] > .85]

In [None]:
from pyphonetics import Soundex
import pandas as pd
import numpy as np
import sys, re
import pyodbc
import importlib
import jellyfish

In [None]:
def levenshtein_per_row(x, y):
    """
    calculates the levenshtein distance between two strings
    x and y - strings to compare
    returns the distance value
    """
    print('Wrapper function to handle None entries for levenshtein distance')
    if x is None or y is None:
        return -1
    else:
        return jellyfish.levenshtein_distance(x, y)
#---------------------------------------------------------------
def jaro_per_row(x,y):
    """
    calculates the jaro distance between two strings
    x and y - strings to compare
    returns the distance value
    """
    print('Wrapper function to handle None entries for jaro distance')
    if x is None or y is None:
        return -1
    else:
        return jellyfish.jaro_distance(x, y)
#---------------------------------------------------------------
def soundex_calc(column1, column2, df):
    """
    calculates the levenshtein difference between 
    two columns - column1 and column2
    returns a list of the distances
    """
    soundex = Soundex()
    soundex_column = []
    #----------------------------------
    print('running the levenshtein distance of the soundex')
    for i in range(len(df)):
        try:
            calculation = soundex.distance(df.loc[i][column1], df.loc[i][column2])
            soundex_column.append(calculation)
        except Exception as exc:
            soundex_column.append(0)
    return soundex_column

##### Calculate the different metrics to compare names further:

In [None]:
matches_df['levenshtein_distance'] = matches_df.apply(lambda x: distance_metrics.levenshtein_per_row(x['nameA'], x['nameB']),axis=1)

matches_df['jaro_distance'] = matches_df.apply(lambda x: distance_metrics.jaro_per_row(x['nameA'], x['nameB']),axis=1)

matches_df['soundex'] = soundex_calc('nameA','nameB',matches_df)

In [None]:
matches_df