In [1]:
from fuzzywuzzy import fuzz
import numpy as np
import pandas as pd
import os

In [2]:
# Names of the processed pickles stored under data/processed, grouped by
# whether they belong to the filtered or the unfiltered set.
filtered = ["filtered_adds", "filtered_ents", "filtered_peos"]
unfiltered = ["unfiltered_adds", "unfiltered_ents", "unfiltered_peos"]

# pickle_dict:  dataset name -> object loaded from its pickle
# unique_index: dataset name -> numpy array of that object's unique index values
pickle_dict, unique_index = {}, {}

for d in [*filtered, *unfiltered]:
    pickle_path = os.path.join("data", "processed", d)
    pickle_dict[d] = pd.read_pickle(pickle_path)
    # The fuzzy lookups only ever score each distinct index label once.
    unique_index[d] = np.array(pickle_dict[d].index.unique())
    print(f"{d} has {unique_index[d].size} unique entries")

filtered_adds has 2929 unique entries
filtered_ents has 634 unique entries
filtered_peos has 3111 unique entries
unfiltered_adds has 350767 unique entries
unfiltered_ents has 770800 unique entries
unfiltered_peos has 528468 unique entries


In [3]:
def fuzzy_match(search, search_crit, base_fuzz=fuzz.ratio, limit=10):
    """Return the best-scoring index values of one dataset for a query string.

    Every unique index value stored in ``unique_index[search_crit]`` is scored
    with ``base_fuzz(candidate, search)`` and the ``limit`` highest-scoring
    candidates are returned, best first.

    Parameters
    ----------
    search : str
        The query string to look for.
    search_crit : str
        Key into ``unique_index`` selecting which dataset to search.
    base_fuzz : callable, optional
        Scoring function called as ``base_fuzz(candidate, search)``; higher
        means more similar. Defaults to ``fuzz.ratio``.
    limit : int, optional
        Maximum number of matches to return (default 10). When the dataset
        holds fewer unique values than ``limit``, all of them are returned.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(scores, matches)``: the scores in descending order and the
        corresponding index values, aligned element by element.

    Raises
    ------
    ValueError
        If ``limit`` is smaller than 1.
    KeyError
        If ``search_crit`` is not a key of ``unique_index``.
    """
    if limit < 1:
        raise ValueError("limit must be a positive integer")

    candidates = unique_index[search_crit]

    # np.vectorize cannot infer an output dtype from an empty input, so an
    # empty dataset is answered directly with empty results.
    if candidates.size == 0:
        return (np.empty(0, dtype=int), candidates[:0])

    scores = np.vectorize(lambda x: base_fuzz(x, search))(candidates)

    # Clamp to the number of candidates: argpartition raises ValueError when
    # asked for more elements than the array holds.
    top_k = min(limit, scores.size)

    # argpartition picks the top_k largest scores without a full sort; only
    # those top_k entries are then ordered from best to worst.
    ind = np.argpartition(scores, -top_k)[-top_k:]
    ind = ind[np.argsort(-scores[ind])]
    return (scores[ind], candidates[ind])

In [4]:
# Look the query up among the unique index values stored under
# "unfiltered_peos"; fuzzy_match keeps the ten best-scoring candidates.
fuzzy_match("Barbara L Devlin", "unfiltered_peos")

array(['Barbaro Levi', 'Barbara Lenz', 'Barbara Nelson', 'Barbara Minns',
       'Barbara Jean Nelson', 'Barbara G. Abela', 'Barbara Siu',
       'Barr - David', 'Barbara Shiu', 'Brand - Kevin'], dtype=object)

In [5]:
def fuzzy_search(search, search_crit, base_fuzz=fuzz.ratio):
    """Return the rows of a dataset whose index best matches a query string.

    Delegates the scoring to ``fuzzy_match`` and then selects the matching
    rows from ``pickle_dict[search_crit]`` by index label.

    Parameters
    ----------
    search : str
        The query string to look for.
    search_crit : str
        Key selecting the dataset, used for both ``unique_index`` (via
        ``fuzzy_match``) and ``pickle_dict``.
    base_fuzz : callable, optional
        Scoring function forwarded to ``fuzzy_match``. Defaults to
        ``fuzz.ratio``.

    Returns
    -------
    pandas object
        The result of ``pickle_dict[search_crit].loc[matches]``, with rows
        ordered from best to worst match. An index label that occurs several
        times in the dataset contributes all of its rows.
    """
    # The scores are only needed to rank the matches, which fuzzy_match has
    # already done, so they are discarded here.
    _, matches = fuzzy_match(search, search_crit, base_fuzz=base_fuzz)
    # The selection must be returned: without the return the function yields
    # None and the calling cell displays nothing.
    return pickle_dict[search_crit].loc[matches]

In [6]:
# unfiltered dataframes have entries from all over the world
# (the country_codes column of the result shows where each entry is from)
fuzzy_search("Barbara L Devlin", "unfiltered_peos")

Unnamed: 0_level_0,sourceID,country_codes
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Barbaro Levi,Panama Papers,MCO
Barbara Lenz,Offshore Leaks,ZAF
Barbara Nelson,Offshore Leaks,USA
Barbara Minns,Panama Papers,JEY
Barbara Jean Nelson,Offshore Leaks,USA
Barbara G. Abela,Offshore Leaks,XXX
Barbara Siu,Offshore Leaks,TWN
Barr - David,Paradise Papers - Appleby,USA
Barbara Shiu,Offshore Leaks,XXX
Brand - Kevin,Paradise Papers - Appleby,USA


In [7]:
# filtered dataframes only have verified Canadian entries, so a name that
# only exists in the unfiltered data (this query is the top unfiltered hit,
# listed there under country code MCO) yields approximate matches only
fuzzy_search("Barbaro Levi", search_crit="filtered_peos")

Unnamed: 0_level_0,sourceID
name,Unnamed: 1_level_1
Barry Revzen,Panama Papers
Barry Revzen,Panama Papers
Barnard - Scott Rae,Paradise Papers - Appleby
Burgher - Kevin,Paradise Papers - Appleby
Barbosa - Carlos,Paradise Papers - Appleby
Terry Lem,Offshore Leaks
WU Bao Lu,Offshore Leaks
Bricel - Mark Leon,Paradise Papers - Appleby
Gray - Kevin D,Paradise Papers - Appleby
Harry L. Stemp,Offshore Leaks


In [8]:
# Override the default scorer (fuzz.ratio) with fuzz.partial_ratio;
# presumably chosen so the short query can match inside longer names —
# confirm against the fuzzywuzzy documentation.
fuzzy_search("holdings", search_crit="filtered_ents", base_fuzz=fuzz.partial_ratio)

Unnamed: 0_level_0,sourceID
name,Unnamed: 1_level_1
Eight International Holdings Ltd.,Panama Papers
Maya Investments Holdings Trading Ltd.,Panama Papers
Seven International Holdings Ltd.,Panama Papers
Brookfield Private Equity Group Holdings LP,Paradise Papers - Appleby
Blue Seymour Holdings Limited,Panama Papers
Amisk Holdings Ltd.,Panama Papers
Torco Investment Holdings Ltd.,Panama Papers
Hazen Holdings Ltd.,Panama Papers
Munich Holdings Agency Account,Paradise Papers - Appleby
Henri Investments Holdings Ltd.,Panama Papers
