In [16]:
import networkx as nx
import os
import pandas as pd
import pickle

from datasketch import MinHash, MinHashLSH
from nameparser import HumanName

#### Read bipartite graph and extract names

In [2]:
# nx.read_gpickle was removed in NetworkX 3.0. It was a thin wrapper around pickle.load,
# so reading the file with pickle directly works with both old and new NetworkX versions.
# NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source.
with open(os.getcwd() + '/Pickle/bipartite-all010420.pickle', 'rb') as pkl:
    bipartite_all = pickle.load(pkl)

bipartite_all.number_of_nodes()

4054832

Nodes from *bipartite_all* have the following attributes:
1. degree: int
2. name: string
3. is_trustee: bool
4. is_employee: bool
5. bipartite: int

Access: bipartite_all.nodes[0]

In [3]:
bipartite_all.nodes[0]

{'degree': 17,
 'name': 'JOSEPH GALLO',
 'is_trustee': True,
 'is_employee': True,
 'bipartite': 1}

In [4]:
bipartite_names = list(bipartite_all.nodes(data = 'name'))  # extract only names
bipartite_names[: 5]

[(0, 'JOSEPH GALLO'),
 (1, 'EMMA DUNCH'),
 (2, 'DONALD BORROR'),
 (3, 'LINDA MOFFITT'),
 (4, "M'LISS DORRANCE")]

#### Read LittleSis names

In [5]:
littlesis_names = {}

with open(os.getcwd() + '/Pickle/all_people.pickle', 'rb') as pkl:
    littlesis_names = pickle.load(pkl)
    
len(littlesis_names)

205548

In [6]:
littlesis_names['1024']  # key is an ent ID

'Roland A Hernandez'

#### Clean names

In [7]:
def cleanNames(names_dict):
    clean_names = {}
    error_count, error_list = 0, []
    
    for name_id in names_dict:
        try:
            name = names_dict[name_id].strip().upper()

            human_name = HumanName(name)

            names_list = [human_name.first, human_name.middle, human_name.last]
            names_list = [n for n in names_list if len(n) > 0]

            if type(name_id) == str:  # convert string ent IDs to int
                name_id = int(name_id)
                
            clean_names[name_id] = ' '.join(names_list)
        
        except:
            error_count += 1
            error_list.append(name_id)
            
#     print(error_count, error_list[: 3])
    print(error_count)
    return clean_names

In [8]:
clean_littlesis_names = cleanNames(littlesis_names)

0


In [9]:
clean_littlesis_names[1024]

'ROLAND A HERNANDEZ'

In [15]:
max_num = 1000  # max number of names from bipartite_all, change this value to change the size of the bipartite_all subset!
# max_num = len(bipartite_names)  # select this to run for all names in bipartite-all

# ID -> raw name for the selected bipartite-all nodes, skipping nodes whose name is None.
# The ID is the same as the node ID in the nx object.
bipartite_names_temp = {
    node_id: node_name
    for node_id, node_name in bipartite_names[: max_num]
    if node_name is not None
}

clean_bipartite_names = cleanNames(bipartite_names_temp)

# cleaned name -> ID; when several IDs share a cleaned name, the last one seen is kept
clean_bipartite_names_reverse = {
    cleaned_name: node_id
    for node_id, cleaned_name in clean_bipartite_names.items()
}

remaining_bipartite = len(clean_bipartite_names)

0


In [16]:
print(len(clean_littlesis_names), len(clean_bipartite_names), len(clean_bipartite_names_reverse))

205548 740 740


#### Hash

In [17]:
count = 0  # number of distinct names inserted into the index so far
lsh = MinHashLSH(threshold = 0.5, num_perm = 128)

for name_id in clean_bipartite_names:
    name = clean_bipartite_names[name_id]

    # The cleaned name is used as the LSH key, and MinHashLSH.insert raises ValueError
    # when a key is inserted twice. Different nodes can share the same cleaned name,
    # so a name that is already indexed is skipped instead of crashing the loop.
    if name in lsh:
        continue

    # one MinHash per name, built from its whitespace-separated tokens
    p = MinHash(num_perm = 128)

    for name_split in name.split():
        p.update(name_split.encode('utf-8'))

    lsh.insert(name, p)

    count += 1
    if count % 100 == 0:
        print('Inserting', count, 'out of', remaining_bipartite)

Inserting 100 out of 740
Inserting 200 out of 740
Inserting 300 out of 740
Inserting 400 out of 740
Inserting 500 out of 740
Inserting 600 out of 740
Inserting 700 out of 740


In [18]:
del clean_bipartite_names
print('Inserted', count, 'names from bipartite-all')

Inserted 740 names from bipartite-all


#### Compare names

In [19]:
def compareNames(person1, person2):
    """Decide whether two cleaned name strings plausibly refer to the same person.

    Names are compared as whitespace-separated tokens. Two names match when:

    * they are identical strings, or
    * they share at least two tokens of length >= 3 and either
        - the number of shared tokens equals the token count of one of the names
          (e.g. 'THOMAS ROONEY' vs 'THOMAS J ROONEY'), or
        - first and last tokens agree, and the only differing tokens are one per
          name that start with the same letter, at least one of them being a
          single-letter initial (e.g. 'THOMAS JAMES ROONEY' vs 'THOMAS J ROONEY').

    Parameters
    ----------
    person1, person2 : str
        Names to compare.

    Returns
    -------
    bool
        True if the names are considered the same person, False otherwise.
    """
    if person1 == person2:
        return True

    names1 = person1.split()
    names2 = person2.split()
    tokens1, tokens2 = set(names1), set(names2)

    common_names = tokens1 & tokens2  # intersection

    # we need at least 2 shared names of min len = 3
    if sum(1 for n in common_names if len(n) >= 3) < 2:
        return False

    # every token of one name appears in the other
    if len(common_names) == len(names1) or len(common_names) == len(names2):
        return True

    extra1 = len(names1) - len(common_names)
    extra2 = len(names2) - len(common_names)

    # (The original also required "extra1 <= 2 or extra2 <= 2"; that is implied by
    # "extra1 == 1 or extra2 == 1" and is therefore omitted.)
    if (extra1 == 1 or extra2 == 1) and names1[0] == names2[0] and names1[-1] == names2[-1]:
        # neither i.e. complement = union - intersection
        neither = list(tokens1 ^ tokens2)

        # Exactly one differing token per name, same first letter, and at least one of
        # them is a bare initial. The original read `len(neither[1] == 1)`, which raised
        # TypeError (silently swallowed) whenever neither[0] was not the initial, so the
        # outcome depended on set iteration order.
        if (len(neither) == 2
                and (len(neither[0]) == 1 or len(neither[1]) == 1)
                and neither[0][0] == neither[1][0]):
            return True

    return False

In [21]:
count = 0
matched_names = {}  # LittleSis ent ID -> list of matching bipartite-all node IDs

# for each name in littlesis data, find candidate matches in the hash and verify them
for count, (name_id, name) in enumerate(clean_littlesis_names.items(), start = 1):
    if count % 500 == 0:
        print('Comparing', count)

    # MinHash of the LittleSis name, built from its tokens the same way as the indexed names
    p = MinHash(num_perm = 128)
    for name_split in name.split():
        p.update(name_split.encode('utf-8'))

    for bip_name in lsh.query(p):
        if not compareNames(name, bip_name):
            continue

        # handling multiple matches: start a list on the first match, append afterwards
        matched_names.setdefault(name_id, []).append(clean_bipartite_names_reverse[bip_name])

print(len(matched_names))

with open(os.getcwd() + '/Pickle/matched_names.pickle', 'wb') as pkl:
    pickle.dump(matched_names, pkl, pickle.HIGHEST_PROTOCOL)

print('Done')

0
Done


#### Parse results

In [10]:
matched_names_res = {}

with open(os.getcwd() + '/Pickle/matched_names.pickle', 'rb') as pkl:
    matched_names_res = pickle.load(pkl)
    
len(matched_names_res)

118659

In [12]:
matched_names_res[262150]  # key: LittleSis ent ID, value: list with node IDs from bipartite-all

[934460, 3191192]

In [13]:
clean_littlesis_names[262150]

'THOMAS ROONEY'

In [14]:
for i in matched_names_res[262150]:
    print(bipartite_all.nodes[i]['name'])

THOMAS J ROONEY
THOMAS M ROONEY


For every key (i.e. LittleSis entity ID) in `matched_names_res`, we need the filtered LittleSis relations for that key.

In [17]:
rels = pd.read_pickle(os.getcwd() + '/Pickle/all_rels.pkl')  # read LittleSis relationships
rels.shape

(1230815, 15)

In [19]:
ents = pd.read_pickle(os.getcwd() + '/Pickle/all_entities.pkl')  # read LittleSis entities
ents.shape

(284500, 13)

In [30]:
matched_ents = ents[ents['id'].isin(matched_names_res.keys())]  # keep only those ents that appear in matched_names_res
matched_ents.shape  # same len as matched_names_res

(118659, 13)

In [52]:
# how often each entity type occurs among the matched entities (one row per entity/type pair)
type_val_counts = matched_ents.explode('types')['types'].value_counts()
reqd_types = ['Philanthropy', 'Other Not-for-Profit', 'Academic Research Institute', 'School', 'Cultural/Arts', 'Policy/Think Tank']  # 'Academic' has been removed

# required types that actually occur, in value_counts order
present_types = [type_name for type_name in type_val_counts.index if type_name in reqd_types]

for type_name in present_types:
    print(type_name)

count = len(present_types)

if count == 0:
    print('None present')

Policy/Think Tank


In [44]:
def filterByType(df, types=None):
    """Return the index labels of rows whose 'types' list contains a required type.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'types' column holding a list of type names per row.
    types : iterable of str, optional
        Type names to keep. Defaults to the notebook-level ``reqd_types``.

    Returns
    -------
    pandas.Index
        Unique index labels of the rows that have at least one of the types.
    """
    if types is None:
        types = reqd_types

    # explode gives one row per (entity, type) pair and repeats the index label
    df_exploded = df.explode('types')
    matching = df_exploded[df_exploded['types'].isin(types)]

    # A row with several required types would otherwise be returned once per type,
    # producing duplicate rows when the result is used with .loc
    return matching.index.unique()

index_to_keep = filterByType(matched_ents)
filtered_ents = ents.loc[index_to_keep]
filtered_ents

Unnamed: 0,summary,parent_id,extensions,website,name,primary_ext,aliases,blurb,types,updated_at,id,start_date,end_date
339139,,,"{'Person': {'name_nick': None, 'nationality': ...",,Nicole Theis,Person,[Nicole Theis],founder and president of the Delaware Family ...,"[Person, Lobbyist, Lobbying Firm, Policy/Think...",2019-02-12T21:10:10Z,339139,,


In [34]:
matched_rels = rels[rels['entity1_id'].isin(matched_ents['id']) | rels['entity2_id'].isin(matched_ents['id'])]  # keep only relationships where at least one node appears in matched_ents
matched_rels.shape

(861244, 15)

In [122]:
bipartite_all[2600811] # len = 17

AtlasView({6950337: {}})

In [114]:
bipartite_all.nodes[1295776]

{'degree': 2,
 'name': 'JAMES GOLDSTON',
 'is_trustee': True,
 'is_employee': True,
 'bipartite': 1}

In [123]:
bipartite_all.nodes[6950337]['Uppername']

'ALTUS GLOBAL ALLIANCE'

In [65]:
matched_res_counter = {i: {j: 0 for j in matched_names_res[i]} for i in matched_names_res}  # {littlesis id: {bipartite-all id: 0, ...}, ...}
len(matched_res_counter)

118659

In [68]:
temp_d = {262162: {1295776: 0, 2600811: 0}}
temp_d

{262162: {1295776: 0, 2600811: 0}}

In [113]:
# x = matched_rels[matched_rels['entity1_id'].isin([262162]) | matched_rels['entity2_id'].isin([262162])]
# x

In [112]:
# y = x[x['entity1_id'] != 262162]['entity1_id'].append(x[x['entity2_id'] != 262162]['entity2_id'])
# y

In [111]:
# ents[ents['id'].isin(y)][['name', 'id']]

In [120]:
def counting_rels():  # compare LittleSis relations with bipartite-all relations
    """Print, for each candidate match in ``temp_d``, the names of its bipartite-all neighbours.

    Reads the notebook-level ``temp_d``, ``matched_rels``, ``ents`` and
    ``bipartite_all``. For each LittleSis ent ID in ``temp_d`` it collects the
    entities on the other side of that ent's relationships (``related_ents``),
    then prints the 'Uppername' of every neighbour of each matched
    bipartite-all node. Returns None.
    """
    for ent_id in temp_d:
        # relationships in which ent_id takes part on either side
        relations = matched_rels[matched_rels['entity1_id'].isin([ent_id]) | matched_rels['entity2_id'].isin([ent_id])]

        # IDs on the other side of those relationships.
        # Series.append was removed in pandas 2.0; pd.concat is the replacement.
        related_ents_id = pd.concat([
            relations[relations['entity1_id'] != ent_id]['entity1_id'],
            relations[relations['entity2_id'] != ent_id]['entity2_id'],
        ])
        related_ents = ents[ents['id'].isin(related_ents_id)][['name', 'id']]

        for bip_match in temp_d[ent_id]:
            names = [bipartite_all.nodes[i]['Uppername'] for i in bipartite_all[bip_match]]
            print(names)

            # TODO: compare names with related_ents['name']
#     print(related_ents)
counting_rels()

['SAVING GRACE ANIMALS FOR ADOPTION INC', 'OPEN SOCIETY INSTITUTE']
['ALTUS GLOBAL ALLIANCE']
           name     id
86189  ABC News  86189
