In [3]:
import pandas as pd
# Load the raw dataset from a CSV file; the path is relative to the working directory.
orig = pd.read_csv("./raw/usa_with_details.csv")

In [4]:
# Show the column labels of the loaded frame.
orig.columns

Index(['a', 'co', 'c', 'd', 'p', 's', 'si', 'ssi', 'stt', 'st', 'LatLong',
       'Id', 'Zipcode', 'CI', 'SearchError', 'ScamTitle', 'BusinessName', 'C',
       'D', 'Description', 'P', 'PKID', 'S', 'ScamID'],
      dtype='object')

In [5]:
import numpy as np
import re
import textdistance
# we will need scikit-learn>=0.21
from sklearn.cluster import AgglomerativeClustering  

# Work on the first 100 rows of the BusinessName column only (positional slice).
texts = orig["BusinessName"].iloc[:100]

def normalize(text):
  """Lower-case ``text`` and keep only the characters a-z and 0-9.

  Every run of other characters (punctuation, whitespace, non-ASCII letters)
  is replaced by a single space. Leading and trailing separators are not
  stripped, so the result may start or end with a space.
  """
  lowered = text.lower()
  cleaned = re.sub('[^a-z0-9]+', ' ', lowered)
  return cleaned

def group_texts(texts, threshold=0.3):
  """Replace each text with the representative of its cluster.

  Each text is converted with ``str`` and passed through ``normalize``. The
  pairwise distance is ``1 - textdistance.jaro_winkler(a, b)``, and the texts
  are grouped by complete-linkage agglomerative clustering on that
  precomputed matrix. The representative of a cluster is the member whose
  summed distance to the members of the same cluster is smallest.

  Args:
    texts: iterable of values. Because of the ``str`` conversion, a missing
      value such as NaN is clustered as the text "nan".
    threshold: ``distance_threshold`` handed to the clustering; this
      parameter needs to be tuned carefully.

  Returns:
    A list with one normalized representative per input, in input order.
    With fewer than two inputs the normalized texts are returned as-is.
  """
  normalized_texts = np.array([normalize(str(text)) for text in texts])
  # The clustering step cannot be fitted on fewer than two samples, so an
  # empty or single-element input is its own (trivial) grouping.
  if len(normalized_texts) < 2:
    return list(normalized_texts)

  distances = 1 - np.array([
      [textdistance.jaro_winkler(one, another) for one in normalized_texts]
      for another in normalized_texts
  ])

  clustering_options = dict(
    distance_threshold=threshold, # this parameter needs to be tuned carefully
    linkage="complete", n_clusters=None
  )
  try:
    # Newer scikit-learn releases call this parameter `metric`.
    model = AgglomerativeClustering(metric="precomputed", **clustering_options)
  except TypeError:
    # Older releases only accept the original `affinity` keyword.
    model = AgglomerativeClustering(affinity="precomputed", **clustering_options)
  labels = model.fit(distances).labels_

  centers = dict()
  for cluster_id in set(labels):
    index = labels == cluster_id
    # Square sub-matrix of distances among the members of this cluster;
    # row sums give each member's total distance to the others.
    centrality = distances[:, index][index].sum(axis=1)
    centers[cluster_id] = normalized_texts[index][centrality.argmin()]
  return [centers[i] for i in labels]
# Cluster the sampled names, then print each row position next to the
# representative chosen for that row.
groups = group_texts(texts)
for i, _ in enumerate(groups):
    print(f'{i} {groups[i]}')

0 pay to drive advertising
1 better business bereau
2 westchester country club 
3 mvmt marketing inc 
4 krecklo associates world vision
5 zeel
6 tencent 
7 extraymasl
8 gse s p a gestore dei servizi energetici 
9 veloxis pharmaceuticals
10 goat group
11 ssd market research
12 n k i consulting
13 esc technology
14 dp world
15 nexus healthcare
16 genstar capital
17 pacylex pharma
18 career daily
19 dave sandifer hatch
20 envases group company
21 house of schwan inc
22 dynamic collective inc 
23 dunton consulting
24 focalpoint llc 
25 odyssey capital group
26 goat group
27 ewrix
28 nan
29 fedex express
30 zee forwarding services 
31 mgm wrap
32 envases group company
33 goat group
34 vixsta solutions llc
35 envases group company
36 empower recruiter
37 ziprecruiter impostor
38 laminion home care
39 davisware inc chiship services llc 
40 next level nrg
41 syntax post
42 rpd packaging
43 odyssey capital group
44 appdynamics
45 focalpoint llc 
46 abco supply llc imposter
47 ziprecruiter impos

In [6]:
# Preview the first 60 raw business names for comparison with the clustered output.
orig["BusinessName"].head(60)

0                        Pay To Drive Advertising
1                          Better Business Bereau
2                       Westchester Country Club 
3                            MVMT marketing inc. 
4                Krecklo Associates/ World Vision
5                                            Zeel
6                                        Tencent 
7                                      Extraymasl
8     GSE S.p.A - Gestore dei Servizi Energetici.
9                         Veloxis Pharmaceuticals
10                                     GOAT GROUP
11                            SSD Market Research
12                               N K I Consulting
13                                 ESC Technology
14                                       DP World
15                               Nexus Healthcare
16                                Genstar Capital
17                                 Pacylex Pharma
18                                   Career Daily
19                            Dave Sandifer Hatch


In [7]:
import pandas as pd
import numpy as np

# helpful modules
import fuzzywuzzy
from fuzzywuzzy import process
import chardet

# set seed for reproducibility (this seeds NumPy's legacy global random state)
np.random.seed(0)

In [12]:
# Number of rows in the raw frame.
orig_n = len(orig)

In [15]:
# Blank out every BusinessName value that is not a string and record which
# rows were touched.
# One `.loc` assignment replaces the per-row chained assignment
# `orig["BusinessName"][i] = ""`, which pandas may apply to a temporary
# object instead of `orig` (SettingWithCopyWarning / Copy-on-Write).
not_text = ~orig["BusinessName"].map(lambda value: isinstance(value, str)).astype(bool)
cnt = orig.index[not_text].tolist()
orig.loc[not_text, "BusinessName"] = ""
len(cnt)

0

In [18]:
# convert to lower case
orig["BusinessName"] = orig["BusinessName"].str.lower()
# remove trailing white spaces
orig["BusinessName"] = orig["BusinessName"].str.strip()

In [19]:
# Unique business names in sorted order, for a closer look.
# (Despite its name, `cities` holds business names.)
cities = np.sort(orig["BusinessName"].unique())
cities

array(['', '"career advisor"', '"job offer" - mystery shopper', ...,
       '©2018 sights on service inc', 'æternity crypto foundation',
       '“lionel”'], dtype=object)

In [36]:
# Score the unique business names against "amazon" with the partial-ratio
# scorer and keep up to 200 of the best matches.
matches = fuzzywuzzy.process.extract(
    "amazon",
    cities,
    limit=200,
    scorer=fuzzywuzzy.fuzz.partial_ratio,
)

In [37]:
# Display the (name, score) pairs returned by the fuzzy search.
matches

[('@amazonprofits.org at amazon profits.org.', 100),
 ('acting as amazon recruiter', 100),
 ('acting as amazonprofits.co', 100),
 ('amazon', 100),
 ('amazon  job scam', 100),
 ('amazon - fake', 100),
 ('amazon affiliate', 100),
 ('amazon associate website impostor', 100),
 ('amazon business associates', 100),
 ('amazon cache website', 100),
 ('amazon cadh', 100),
 ('amazon careers', 100),
 ('amazon cash', 100),
 ('amazon cash website', 100),
 ('amazon cash websites', 100),
 ('amazon cashe website', 100),
 ('amazon company name spoofing', 100),
 ('amazon corporate llc imposter', 100),
 ('amazon data jobs', 100),
 ('amazon data jobs imposter', 100),
 ('amazon employment offer imposter', 100),
 ('amazon employment scam', 100),
 ('amazon fast inspect goods', 100),
 ('amazon fast inspect llc', 100),
 ('amazon fast inspect my pack llc', 100),
 ('amazon flex', 100),
 ('amazon flex advertising logo', 100),
 ('amazon from home', 100),
 ('amazon from home imposter', 100),
 ('amazon from home.org

In [None]:
def replace_matches_in_column(df, column, string_to_match, min_ratio = 80, limit = 10):
    """Overwrite fuzzy matches of ``string_to_match`` in ``df[column]`` with that string.

    The unique values of the column are scored against ``string_to_match``
    with ``fuzzywuzzy.fuzz.token_sort_ratio``. Among the ``limit`` best
    candidates, those scoring at least ``min_ratio`` are treated as the same
    entity, and every row holding one of them is set to ``string_to_match``.

    Args:
        df: DataFrame to clean. It is modified in place.
        column: name of the column to search and rewrite.
        string_to_match: canonical value to search for and write back.
        min_ratio: minimum score (inclusive) for a candidate to be replaced.
        limit: maximum number of candidates requested from the fuzzy search.

    Returns:
        None. Prints "All done!" when finished.
    """
    # get a list of unique strings; missing values are dropped because the
    # scorer works on text, not on NaN
    strings = df[column].dropna().unique()

    # get the `limit` closest matches to our input string
    matches = fuzzywuzzy.process.extract(string_to_match, strings,
                                         limit=limit, scorer=fuzzywuzzy.fuzz.token_sort_ratio)

    # only keep matches with a ratio >= min_ratio
    close_matches = [match[0] for match in matches if match[1] >= min_ratio]

    # get the rows of all the close matches in our dataframe
    rows_with_matches = df[column].isin(close_matches)

    # replace all rows with close matches with the input string
    df.loc[rows_with_matches, column] = string_to_match

    # let us know the function's done
    print("All done!")