# sparse_dot_topn Cosine Similarity

https://towardsdatascience.com/de-duplicate-the-duplicate-records-from-scratch-f6e5ad9e79da

https://github.com/ing-bank/sparse_dot_topn/blob/master/sparse_dot_topn/awesome_cossim_topn.py

https://medium.com/wbaa/https-medium-com-ingwbaa-boosting-selection-of-the-most-similar-entities-in-large-scale-datasets-450b3242e618

https://bergvca.github.io/2017/10/14/super-fast-string-matching.html

https://laptrinhx.com/fast-string-matching-in-python-1391866645/

In [10]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/susanli2016/NLP-with-Python/master/data/Seattle_Hotels_Duplicates.csv', encoding="latin-1")
df.head()

Unnamed: 0,name,address
0,Hilton Garden Inn Seattle Downtown,"1821 Boren Avenue, Seattle Washington 98101 USA"
1,Sheraton Grand Seattle,"1400 6th Avenue, Seattle, Washington 98101 USA"
2,Crowne Plaza Seattle Downtown,"1113 6th Ave, Seattle, WA 98101"
3,Kimpton Hotel Monaco Seattle,"1101 4th Ave, Seattle, WA98101"
4,The Westin Seattle,"1900 5th Avenue, Seattle, Washington 98101 USA"


In [3]:
df[df.duplicated()]

Unnamed: 0,name,address
28,Ace Hotel Seattle,"2423 1st Ave, Seattle, WA 98121"
92,Seattle Inn Northgate,"12035 Aurora Ave N, Seattle, WA 98133"


In [4]:
df[df['name']=="Ace Hotel Seattle"]

Unnamed: 0,name,address
16,Ace Hotel Seattle,"2423 1st Ave, Seattle, WA 98121"
28,Ace Hotel Seattle,"2423 1st Ave, Seattle, WA 98121"


In [5]:
df['name_address']=df['name'] + df['address']
name_address = df['name_address']
vectorizer = TfidfVectorizer("char", ngram_range=(1,4), sublinear_tf=True)
tf_idf_matrix = vectorizer.fit_transform(name_address)

In [16]:
def awesome_cossim_top(A, B, ntop, lower_bound=0):
  
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape
    
    print('A.shape', M, _,'B.shape',_, N)
 
    idx_dtype = np.int32
 
    nnz_max = M*ntop
    print("dfdfdfdf.........\n",nnz_max)
 
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    #print(data)
    
    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data,indices,indptr),shape=(M,N))

matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 5)
print(matches)

A.shape 168 2858 B.shape 2858 168
dfdfdfdf.........
 840
  (0, 0)	0.9999999999999997
  (0, 9)	0.5824171218360623
  (0, 1)	0.3135113883850482
  (0, 4)	0.31182166147590334
  (0, 6)	0.23730879157386087
  (1, 1)	0.9999999999999997
  (1, 6)	0.37350377496055537
  (1, 4)	0.34081048235170586
  (1, 0)	0.3135113883850482
  (1, 12)	0.1378160050853022
  (2, 2)	0.9999999999999997
  (2, 12)	0.2483514281936836
  (2, 55)	0.22749304847306903
  (2, 147)	0.22472137317266058
  (2, 25)	0.1762294271026541
  (3, 3)	0.9999999999999996
  (3, 11)	0.14172746547004708
  (3, 33)	0.13191309118835334
  (3, 163)	0.11319401219045719
  (3, 107)	0.10020761079690314
  (4, 4)	0.9999999999999996
  (4, 1)	0.34081048235170586
  (4, 0)	0.31182166147590334
  (4, 6)	0.23956702643172306
  (4, 12)	0.11151354775913253
  :	:
  (163, 163)	0.9999999999999996
  (163, 33)	0.3476878520347567
  (163, 11)	0.2775417459288508
  (163, 107)	0.2716986280293083
  (163, 164)	0.24838232512963154
  (164, 164)	1.0
  (164, 107)	0.385089856273933
  (

In [19]:
def get_matches_df(sparse_matrix, name_vector, top=840):
    non_zeros = sparse_matrix.nonzero()
    
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    
    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size
    
    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)
    
    for index in range(0, nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        similairity[index] = sparse_matrix.data[index]
    
    return pd.DataFrame({'left_side': left_side,
                          'right_side': right_side,
                           'similarity': similairity})

matches_df = get_matches_df(matches, name_address)

In [20]:
matches_df[matches_df['similarity'] < 0.99999].sort_values(by=['similarity'], ascending=False).head(30)

Unnamed: 0,left_side,right_side,similarity
206,Holiday Inn Express & Suites Seattle-City Cent...,Holiday Inn Express & Suites Seattle City Cent...,0.680559
256,Holiday Inn Express & Suites Seattle City Cent...,Holiday Inn Express & Suites Seattle-City Cent...,0.680559
826,Pike's Place Lux Suites by Barsala2nd Ave and ...,Pike's Place Lux Suites by Barsala2rd Ave and ...,0.673365
831,Pike's Place Lux Suites by Barsala2rd Ave and ...,Pike's Place Lux Suites by Barsala2nd Ave and ...,0.673365
181,Travelodge Seattle by The Space Needle200 6th ...,Travelodge Seattle by The Space Needle200 6th ...,0.629785
211,Travelodge Seattle by The Space Needle200 6th ...,Travelodge Seattle by The Space Needle200 6th ...,0.629785
791,citizenM Seattle South Lake Union hotel201 Wes...,citizenM Seattle South Lake Union hotel201 Wes...,0.629524
836,citizenM Seattle South Lake Union hotel201 Wes...,citizenM Seattle South Lake Union hotel201 Wes...,0.629524
1,Hilton Garden Inn Seattle Downtown1821 Boren A...,Hilton Garden Inn Seattle Downtown1821 Boren A...,0.582417
46,Hilton Garden Inn Seattle Downtown1821 Boren A...,Hilton Garden Inn Seattle Downtown1821 Boren A...,0.582417


In [21]:
matches_df[matches_df['similarity'] < 0.50].right_side.nunique()

147