## Fuzzy Match POC with Apache Spark
The objective of this project is to test native Spark functions for string similarity analysis, using a variety of similarity algorithms.

### Approaches

- 2nd Approach: Use Term Frequency–Inverse Document Frequency (TF-IDF) first, and only then apply the native Scala Spark SQL fuzzy-match algorithms

References:
- [Josh Taylor: Fuzzy matching at scale](https://towardsdatascience.com/fuzzy-matching-at-scale-84f2bfd0c536#:~:text=The%20problem%20with%20Fuzzy%20Matching%20on%20large%20data&text=In%20computer%20science%2C%20this%20is,that%20works%20in%20quadratic%20time.)

In [1]:

import pandas as pd
import names
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

### Prepare datasets
`targets` is the list of names that will be looked up in `comparison`.

In [2]:
# Names to search for. Each record stores the name under 'SimilarityWith'.
# Some spellings are deliberately imperfect so the fuzzy match has work to do.
targets = [
    {'SimilarityWith': target_name}
    for target_name in (
        'HP PARTICIPACOES S/A',
        'SAMSUMG',
        'JOAO PEDRO PAULO',
        'PEDRO CHRISTIAN DA SILVA JR.',
        'LUIS CARLOS JR',
        'JOAO PEDRO FREDERICH HESRV',
        'CAMARGO CORREA SA',
        'SANTOS BRASIL TERMINAL PORTUARIO DE EXPORTACAO',
        'CONCAIS TERMINAL PORTUARIO',
        'JUNIOR RONALD FLINDSMAN',
        'COCA COLA LTDA',
        'BRASTEMP PRODUTOS TECNOLOGICOS LTDA',
        'MINERVA EXPORTACAO DE CARNES',
        'CENTRO DE CONVENCOES JESUS LUZ',
        'JUAN MERCADO DA SILVA',
    )
]

#-----------------------------------NAME 2--------------------------------------
# Names the targets are compared against. Each record stores the name under 'Name'.
comparison = [
    {'Name': candidate_name}
    for candidate_name in (
        'HAP PARCTICIPACOES S/A',
        'HPPY PARTY S/A',
        'HIPPIES PATTERN S/A',
        'SAMSUNG',
        'JOHN PEDRO PAULO',
        'JOAO ROBERTO DA SILVA',
        'JOANA PEDROSO FERRAZ',
        'PEDRO SILVA JUNIOR',
        'PERSIO FERREIRA JUNIOR',
        'LUIS CARLOS OLIVERIA JUNIOR',
        'LUISA CAROLINA BORGES',
        'JOAO FRED HESRV',
        'FREDERICO HENRIQUE',
        'FREDERICO PEDROZO DE MORAES',
        'C CORREA EMPREENDIMENTOS SA',
        'CENTRO CORREAS E ACESSORIOS',
        'CISNEI CISCORREA EMPREITEIRA SA',
        'TRASNPETRO EXPORTACAO',
        'CONCAIS TERMINAL',
        'RONALD FLINDSMAN',
        'RONALDO FLETCHER ARMANDO',
        'COCA COLA LTDA',
        'COCADA DA MARIA LTDA',
        'BRASTEMP TECH LTDA',
        'BRASIL TECNOLOGIA LTDA',
        'MINERVA COMEX SERVICOS ALIMENTICIOS',
        'MINERIO EXPLORACAO E CAVAGEM',
        'MINAS GERAIS EXP',
        'CENTRO DE CONVENCOES SAO PAULO EXPOCENTER',
        'CELTIC CONNECTION SAO PAULO',
        'MERCADO DA SILVIA',
        'JOANA DA SILVA',
        'Raphael Almeida Balogo',
        'XY',
    )
]

# Report both list sizes and the size of their full cross join.
print(len(targets))
print(len(comparison))

print(f"CrossJoined dataset size: {len(targets) * len(comparison)}")

15
34
CrossJoined dataset size: 510


<hr>

### Term Frequency, Inverse Document Frequency (TF-IDF) Approach 


Function that generates the list of character n-grams (3 characters long by default) from the full string

In [3]:
def ngrams(string, n=3):
    """Return the overlapping character n-grams of ``string``.

    Args:
        string: Text to split; it is sliced and its items are joined with ''.
        n: Length of each n-gram (defaults to 3).

    Returns:
        A list of substrings of length ``n``, in order of appearance. The list
        is empty when ``string`` is shorter than ``n`` or when ``n`` is 0.
    """
    # Build n copies of the string, each shifted one more character to the
    # left; zipping them lines up the characters of every window and stops
    # at the shortest copy, so only complete windows are produced.
    shifted_copies = (string[offset:] for offset in range(n))
    return [''.join(window) for window in zip(*shifted_copies)]

In [7]:
# Deduplicate the names. Sorting (instead of relying on raw set iteration
# order, which changes between kernel restarts because of string hash
# randomization) keeps the position -> name mapping, and therefore every
# downstream result, reproducible under "Restart & Run All".
targets_list = sorted({record['SimilarityWith'] for record in targets})
comparison_list = sorted({record['Name'] for record in comparison})

targets_count = len(targets_list)

# TF-IDF over character n-grams produced by `ngrams`. The analyzer works on
# the raw strings, so matching is case-sensitive.
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False)
tfidf = vectorizer.fit_transform(targets_list)
# Ask for as many neighbours as there are targets, so each query is ranked
# against every target name.
nbrs_model = NearestNeighbors(n_neighbors=targets_count, n_jobs=-1).fit(tfidf)

def getNearestN(query, nbrs):
  """Find the nearest fitted neighbours for each string in ``query``.

  The strings are vectorised with the module-level ``vectorizer`` (which must
  already be fitted) and then passed to ``nbrs.kneighbors``.

  Args:
      query: Iterable of strings to look up.
      nbrs: Fitted neighbours model exposing ``kneighbors``.

  Returns:
      A ``(distances, indices)`` pair as produced by ``nbrs.kneighbors``.
  """
  neighbor_distances, neighbor_indices = nbrs.kneighbors(vectorizer.transform(query))
  return neighbor_distances, neighbor_indices

# Minimum rescaled similarity for two target names to count as near-duplicates.
DEDUP_SIMILARITY_THRESHOLD = 0.5
# Extra neighbours requested on top of the largest near-duplicate group.
NEIGHBOR_MARGIN = 2

# Query the model with the targets themselves to measure how close the
# target names are to one another.
dedup_distances, dedup_indices = getNearestN(targets_list, nbrs_model)

# Largest distance observed; used to rescale distances into [0, 1].
max_dedup_distance = dedup_distances.max()
# If every distance is 0 (e.g. a single target), dividing by the maximum
# would produce NaN; fall back to 1 so every pair is scored as similarity 1.
distance_scale = max_dedup_distance if max_dedup_distance > 0 else 1.0

# One row per (query target, neighbour) pair:
# [rescaled similarity, neighbour name, query name].
dedup_matches = [
    [1 - distance / distance_scale, targets_list[neighbor_pos], targets_list[query_pos]]
    for query_pos, (row_distances, row_indices) in enumerate(zip(dedup_distances, dedup_indices))
    for distance, neighbor_pos in zip(row_distances, row_indices)
]

# NOTE: the 'Distance' column holds 1 - normalised distance, i.e. a
# similarity (1 = identical), not a distance.
dedup_pairs_df = pd.DataFrame(dedup_matches, columns=['Distance', 'Target name', 'Dataset name'])
dedup_pairs_df = dedup_pairs_df.loc[dedup_pairs_df['Distance'] > DEDUP_SIMILARITY_THRESHOLD]

# Count, per target name, how many distinct close pairs it takes part in
# (its match with itself included), largest group first.
dedup_matches_df = (
    dedup_pairs_df
    .drop_duplicates()
    .groupby(['Target name'])
    .size().reset_index(name='counts')
    .sort_values(by=['counts'], ascending=False)
)

# Neighbours to keep per query: the largest near-duplicate group plus a
# margin, but never more than the model returns (it was fitted with
# n_neighbors=targets_count), otherwise indexing the result rows would fail.
neighbors_count = min(int(dedup_matches_df['counts'].iloc[0]) + NEIGHBOR_MARGIN, targets_count)
print(f'Max neighbors possible: {neighbors_count}')

dedup_matches_df.to_csv('tfidf_knn_dedups.csv', index=False)

Max neighbors possible: 3


In [None]:

# Look up every comparison name against the model fitted on the targets.
distances, indices = getNearestN(comparison_list, nbrs_model)
comparison_list = list(comparison_list)

# Largest distance over all query/neighbour pairs (informational only).
max_distance = round(max(max(row) for row in distances), 2)

print(f'Max distance: {max_distance}')

# One row per (comparison name, kept neighbour):
# [distance rounded to 2 decimals, target name, comparison name].
matches = [
    [round(distances[row][col], 2), targets_list[indices[row][col]], comparison_list[row]]
    for row in range(len(indices))
    for col in range(neighbors_count)
]

print(len(matches))

In [245]:
# Upper bound on the distance for a pair to be kept.
# NOTE(review): the rationale for 1.14 is not shown in this notebook — confirm.
MAX_MATCH_DISTANCE = 1.14
# Pairs whose (rounded) distance equals this value are discarded.
# NOTE(review): presumably this drops queries that share no n-gram with the
# vocabulary (an all-zero TF-IDF vector sits at distance 1 from any
# L2-normalised target) — confirm. Because distances were rounded to two
# decimals upstream, genuine distances close to 1.0 are dropped as well.
EXCLUDED_DISTANCE = 1

matches_df = pd.DataFrame(matches, columns=['Distance','Target name','Dataset name'])

# Build both conditions on the same frame and apply them in a single step,
# rather than chaining .loc with a mask computed on the unfiltered frame
# (which only works through implicit index alignment).
keep_mask = (matches_df['Distance'] < MAX_MATCH_DISTANCE) & (matches_df['Distance'] != EXCLUDED_DISTANCE)
matches_df = (
    matches_df.loc[keep_mask]
    .drop_duplicates()
    .sort_values(by=['Target name', 'Distance'], ascending=True)
)

matches_df.to_csv('tfidf_knn.csv', index=False)
print(len(matches_df))

34
