In [1]:
from fuzzywuzzy import fuzz
import numpy as np
import pandas as pd
import os

In [2]:
pickle_dict = {}
unique_index = {}
filtered = ["filtered_adds", "filtered_ents", "filtered_peos"]
unfiltered = ["unfiltered_adds", "unfiltered_ents", "unfiltered_peos"]
canlii = ["canlii_party"]

for d in filtered + unfiltered + canlii:
    pickle_dict[d] = pd.read_pickle(os.path.join("data", "processed", d+".pickle"))
    unique_index[d] = np.array(pickle_dict[d].name.unique())
    print(f"{d} has {unique_index[d].size} unique entries")

filtered_adds has 1617 unique entries
filtered_ents has 55 unique entries
filtered_peos has 1644 unique entries
unfiltered_adds has 199320 unique entries
unfiltered_ents has 286911 unique entries
unfiltered_peos has 286111 unique entries
canlii_party has 64900 unique entries


In [3]:
def fuzzy_match(search, search_crit, base_fuzz=fuzz.ratio):
    search_df = pickle_dict[search_crit]
    scores = np.vectorize(lambda x : base_fuzz(x, search))(unique_index[search_crit])
    ind = np.argpartition(scores, -5)[-5:]
    ind = ind[np.argsort(-scores[ind])]
    return dict(zip(unique_index[search_crit][ind], scores[ind]))

In [4]:
fuzzy_match("Barbara L Devlin", "unfiltered_peos")

{'Barr - David': 64,
 'Barber - Colin C': 62,
 'Brand - Kevin': 62,
 'Barlaba - Ben': 62,
 'Harrigan - Kevin': 62}

In [5]:
def fuzzy_search(search, search_crit, base_fuzz=fuzz.ratio):
    scores = fuzzy_match(search, search_crit, base_fuzz=base_fuzz)
    search_df = pickle_dict[search_crit].set_index("name").loc[scores.keys()]
    search_df = search_df[:5]
    search_df["scores"] = scores.values()
    return search_df

In [6]:
# unfiltered dataframes have entries from all over the world
fuzzy_search("Barbara L Devlin", "unfiltered_peos")

Unnamed: 0_level_0,country_codes,sourceID,url,scores
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Barr - David,USA,Paradise Papers - Appleby,https://offshoreleaks.icij.org/search?utf8=%E2...,64
Barber - Colin C,GBR,Paradise Papers - Appleby,https://offshoreleaks.icij.org/search?utf8=%E2...,62
Brand - Kevin,USA,Paradise Papers - Appleby,https://offshoreleaks.icij.org/search?utf8=%E2...,62
Barlaba - Ben,BMU,Paradise Papers - Appleby,https://offshoreleaks.icij.org/search?utf8=%E2...,62
Harrigan - Kevin,USA,Paradise Papers - Appleby,https://offshoreleaks.icij.org/search?utf8=%E2...,62


In [7]:
# filtered dataframes only have verified Canadian entries
fuzzy_search("Barbaro Levi", search_crit="filtered_peos")

Unnamed: 0_level_0,country_codes,sourceID,url,scores
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Barnard - Scott Rae,CAN,Paradise Papers - Appleby,https://offshoreleaks.icij.org/search?utf8=%E2...,52
Burgher - Kevin,CAN,Paradise Papers - Appleby,https://offshoreleaks.icij.org/search?utf8=%E2...,52
Barbosa - Carlos,CAN,Paradise Papers - Appleby,https://offshoreleaks.icij.org/search?utf8=%E2...,50
Bricel - Mark Leon,CAN,Paradise Papers - Appleby,https://offshoreleaks.icij.org/search?utf8=%E2...,47
Gray - Kevin D,CAN,Paradise Papers - Appleby,https://offshoreleaks.icij.org/search?utf8=%E2...,46


In [8]:
fuzzy_search("Golden Trim Enterprises Inc ", search_crit="filtered_ents", base_fuzz=fuzz.partial_ratio)

Unnamed: 0_level_0,country_codes,sourceID,url,scores
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
IPL Enterprises Inc.,CAN,Paradise Papers - Appleby,https://offshoreleaks.icij.org/search?utf8=%E2...,80
IPL Enterprises Inc. (CAD),CAN,Paradise Papers - Appleby,https://offshoreleaks.icij.org/search?utf8=%E2...,74
Lenka Trust,CAN,Paradise Papers - Appleby,https://offshoreleaks.icij.org/search?utf8=%E2...,45
Sonora Diamond Corporation Ltd.,CAN,Paradise Papers - Appleby,https://offshoreleaks.icij.org/search?utf8=%E2...,43
Kinnear,CAN,Paradise Papers - Appleby,https://offshoreleaks.icij.org/search?utf8=%E2...,43


In [9]:
fuzzy_search("45 Springfield Building", search_crit="canlii_party", base_fuzz=fuzz.ratio)

Unnamed: 0_level_0,court,url,year,scores
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Starfield Holdings Ltd,bcsc,https://www.canlii.org/en/bc/bcsc/doc/1997/199...,1997.0,58
Litchfield Holdings,bcsc,https://www.canlii.org/en/bc/bcsc/doc/2002/200...,2002.0,57
Martel Building Ltd,fct,https://www.canlii.org/fr/ca/cfpi/doc/1997/199...,1997.0,57
Martel Building Ltd,scc,https://www.canlii.org/en/ca/scc/doc/2000/2000...,2000.0,55
Maligne Buildings Ltd,fct,https://www.canlii.org/en/ca/fct/doc/1982/1982...,1982.0,53


In [10]:
def offshore_leaks_search_address(address):
    offshore_df = fuzzy_search(address, search_crit="filtered_adds")
    return offshore_df

In [11]:
offshore_leaks_search_address("45 Springfield Building")

Unnamed: 0_level_0,country_codes,sourceID,url,scores
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
45 Springfield Building,CAN,Paradise Papers - Appleby,https://offshoreleaks.icij.org/search?utf8=%E2...,100
7 Springfield Crescent,CAN,Paradise Papers - Appleby,https://offshoreleaks.icij.org/search?utf8=%E2...,62
The Fortis Building,CAN,Paradise Papers - Appleby,https://offshoreleaks.icij.org/search?utf8=%E2...,57
British Colonal Building,CAN,Paradise Papers - Appleby,https://offshoreleaks.icij.org/search?utf8=%E2...,55
The Block Building,CAN,Paradise Papers - Appleby,https://offshoreleaks.icij.org/search?utf8=%E2...,54


In [12]:
def canlii_search_address(address):
    canlii_df = fuzzy_search(address, search_crit="canlii_party")
    return canlii_df

In [13]:
def offshore_leaks_search_entity(entity):
    offshore_df = pd.concat([fuzzy_search(entity, search_crit="filtered_ents"),
                             fuzzy_search(entity, search_crit="unfiltered_ents")],
                            axis=0).drop_duplicates()
    return offshore_df

In [14]:
def canlii_search_entity(entity):
    canlii_df = fuzzy_search(entity, search_crit="canlii_party")
    return canlii_df

In [15]:
offshore_leaks_search_entity("Microsoft")

Unnamed: 0_level_0,country_codes,sourceID,url,scores
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Munich Holdings Agency Account,CAN,Paradise Papers - Appleby,https://offshoreleaks.icij.org/search?utf8=%E2...,36
Lenka Trust,CAN,Paradise Papers - Appleby,https://offshoreleaks.icij.org/search?utf8=%E2...,30
Sonora Diamond Corporation Ltd.,CAN,Paradise Papers - Appleby,https://offshoreleaks.icij.org/search?utf8=%E2...,25
Magic Trust,,Paradise Papers - Nevis corporate registry,,60
Anic Trust,,Paradise Papers - Nevis corporate registry,,53
Morison Ltd,MLT,Paradise Papers - Malta corporate registry,https://offshoreleaks.icij.org/search?utf8=%E2...,50
Kaico Trust,,Paradise Papers - Appleby,,50
Macaw Trust,JAM,Paradise Papers - Appleby,https://offshoreleaks.icij.org/search?utf8=%E2...,50


In [16]:
def offshore_leaks_search_people(people):
    offshore_df = pd.concat([fuzzy_search(people, search_crit="filtered_peos"),
                             fuzzy_search(people, search_crit="unfiltered_peos")],
                            axis=0).drop_duplicates()
    return offshore_df

In [17]:
def canlii_search_people(people):
    canlii_df = fuzzy_search(people, search_crit="canlii_party")
    return canlii_df

In [18]:
# pd.options.display.max_rows
# pd.set_option('display.max_colwidth', -1)
canlii_search_people("Junghoo Kim")

Unnamed: 0_level_0,court,url,year,scores
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jhooti,bchrt,https://www.canlii.org/en/bc/bchrt/doc/2009/20...,2009.0,59
Woo Lim,fct,https://www.canlii.org/en/ca/fct/doc/2012/2012...,2012.0,56
Uhuangho,fct,https://www.canlii.org/en/ca/fct/doc/2005/2005...,2005.0,53
Jung,fct,https://www.canlii.org/en/ca/fct/doc/2015/2015...,2015.0,53
Jung,fct,https://www.canlii.org/en/ca/fct/doc/2014/2014...,2014.0,50
