In [1]:
import networkx as nx
import os
import pandas as pd
import pickle

from datasketch import MinHash, MinHashLSH
from nameparser import HumanName

#### Read bipartite graph and extract names

In [2]:
# nx.read_gpickle was removed in NetworkX 3.0; a .pickle graph file is an ordinary
# pickle, so load it with the standard library (works on old and new NetworkX).
# NOTE: unpickling executes arbitrary code -- only load pickles from a trusted source.
with open(os.getcwd() + '/Pickle/bipartite-all010420.pickle', 'rb') as pkl:
    bipartite_all = pickle.load(pkl)
bipartite_all.number_of_nodes()

4054832

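Person nodes in *bipartite_all* (for example node 0, shown below) have the following attributes:
1. degree: int
2. name: string
3. is_trustee: bool
4. is_employee: bool
5. bipartite: int

Access a node's attributes with `bipartite_all.nodes[node_id]`, e.g. `bipartite_all.nodes[0]`.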

In [3]:
bipartite_all.nodes[0]

{'degree': 17,
 'name': 'JOSEPH GALLO',
 'is_trustee': True,
 'is_employee': True,
 'bipartite': 1}

In [4]:
bipartite_names = list(bipartite_all.nodes(data = 'name'))  # extract only names
bipartite_names[: 5]

[(0, 'JOSEPH GALLO'),
 (1, 'EMMA DUNCH'),
 (2, 'DONALD BORROR'),
 (3, 'LINDA MOFFITT'),
 (4, "M'LISS DORRANCE")]

#### Read LittleSis names

In [5]:
littlesis_names = {}

with open(os.getcwd() + '/Pickle/all_people.pickle', 'rb') as pkl:
    littlesis_names = pickle.load(pkl)
    
len(littlesis_names)

205548

In [6]:
littlesis_names['1024']  # key is an ent ID

'Roland A Hernandez'

#### Clean names

In [7]:
def cleanNames(names_dict):
    clean_names = {}
    error_count, error_list = 0, []
    
    for name_id in names_dict:
        try:
            name = names_dict[name_id].strip().upper()

            human_name = HumanName(name)

            names_list = [human_name.first, human_name.middle, human_name.last]
            names_list = [n for n in names_list if len(n) > 0]

            if type(name_id) == str:  # convert string ent IDs to int
                name_id = int(name_id)
                
            clean_names[name_id] = ' '.join(names_list)
        
        except:
            error_count += 1
            error_list.append(name_id)
            
#     print(error_count, error_list[: 3])
    print(error_count)
    return clean_names

In [8]:
clean_littlesis_names = cleanNames(littlesis_names)

0


In [9]:
clean_littlesis_names[1024]

'ROLAND A HERNANDEZ'

In [15]:
# Number of leading entries of bipartite_names to consider; change this value to
# change the size of the bipartite_all subset!
max_num = 1000
# max_num = len(bipartite_names)  # select this to run for all names in bipartite-all

# node ID -> raw name for the selected entries, skipping nodes whose name is None.
# The ID is the same as the node ID in the nx object.
bipartite_names_temp = {
    node_id: node_name
    for node_id, node_name in bipartite_names[: max_num]
    if node_name is not None
}
clean_bipartite_names = cleanNames(bipartite_names_temp)

# cleaned name -> node ID.
# NOTE(review): when several nodes share the same cleaned name only the last node ID
# survives in this mapping -- confirm that is acceptable for the full run.
clean_bipartite_names_reverse = {
    cleaned_name: node_id
    for node_id, cleaned_name in clean_bipartite_names.items()
}
remaining_bipartite = len(clean_bipartite_names)

0


In [16]:
print(len(clean_littlesis_names), len(clean_bipartite_names), len(clean_bipartite_names_reverse))

205548 740 740


#### Hash

In [17]:
# Build an LSH index over the cleaned bipartite-all names. Each name is hashed as
# the set of its whitespace-separated tokens and stored under the name itself.
count = 0
lsh = MinHashLSH(threshold = 0.5, num_perm = 128)

for name_id, name in clean_bipartite_names.items():
    count += 1
    if count % 100 == 0:
        print('Inserting', count, 'out of', remaining_bipartite)

    # The index is keyed by name and MinHashLSH.insert raises ValueError on a
    # duplicate key, so a name shared by several nodes is inserted only once.
    if name in lsh:
        continue

    p = MinHash(num_perm = 128)

    for name_split in name.split():
        p.update(name_split.encode('utf-8'))

    lsh.insert(name, p)

Inserting 100 out of 740
Inserting 200 out of 740
Inserting 300 out of 740
Inserting 400 out of 740
Inserting 500 out of 740
Inserting 600 out of 740
Inserting 700 out of 740


In [18]:
del clean_bipartite_names
print('Inserted', count, 'names from bipartite-all')

Inserted 740 names from bipartite-all


#### Compare names

In [19]:
def compareNames(person1, person2):
    """Decide whether two cleaned names plausibly refer to the same person.

    Names are compared as whitespace-separated tokens. The result is True when:

    1. the two strings are identical; or
    2. they share at least two tokens of length >= 3 and the number of shared
       tokens equals the token count of either name (one name is contained in
       the other, e.g. 'THOMAS ROONEY' vs 'THOMAS J ROONEY'); or
    3. they share at least two tokens of length >= 3, have the same first and
       last token, one of them has exactly one unshared token, and the two
       unshared tokens are an initial and a word starting with that initial
       (e.g. 'THOMAS J ROONEY' vs 'THOMAS JAMES ROONEY').

    Parameters
    ----------
    person1, person2 : str
        Names to compare.

    Returns
    -------
    bool
        True if the names are considered the same person, else False.
    """
    if person1 == person2:
        return True

    names1 = person1.split()
    names2 = person2.split()
    tokens1, tokens2 = set(names1), set(names2)

    common_names = tokens1 & tokens2  # intersection

    # we need at least 2 shared names of min len = 3
    if sum(1 for n in common_names if len(n) >= 3) < 2:
        return False

    # one name is fully covered by the shared tokens
    if len(common_names) == len(names1) or len(common_names) == len(names2):
        return True

    unshared1 = len(names1) - len(common_names)
    unshared2 = len(names2) - len(common_names)

    if (unshared1 == 1 or unshared2 == 1) and names1[0] == names2[0] and names1[-1] == names2[-1]:
        # tokens that appear in exactly one of the names (union - intersection)
        neither = list(tokens1 ^ tokens2)

        # Set order is arbitrary, so check both tokens for being the initial.
        # (The original wrote len(neither[1] == 1), which raised TypeError and was
        # silently swallowed, making this rule depend on set ordering.)
        if (
            len(neither) == 2
            and (len(neither[0]) == 1 or len(neither[1]) == 1)
            and neither[0][0] == neither[1][0]
        ):
            return True

    return False

In [21]:
# Match every LittleSis name against the LSH index of bipartite-all names.
# matched_names maps a LittleSis ent ID to the list of bipartite-all node IDs whose
# names were returned by the index and then confirmed by compareNames.
# NOTE(review): the dump below always overwrites matched_names.pickle, including
# when the index was built from a max_num subset -- confirm before running.
count = 0
matched_names = {}

for count, (name_id, name) in enumerate(clean_littlesis_names.items(), start = 1):
    if count % 500 == 0:
        print('Comparing', count)

    # hash the name as the set of its tokens, exactly as done when building the index
    p = MinHash(num_perm = 128)
    for name_split in name.split():
        p.update(name_split.encode('utf-8'))

    res = lsh.query(p)

    for bip_name in res:
        if not compareNames(name, bip_name):
            continue

        # handling multiple matches: one list per LittleSis ID, created on first match
        bip_id = clean_bipartite_names_reverse[bip_name]
        matched_names.setdefault(name_id, []).append(bip_id)

print(len(matched_names))

with open(os.getcwd() + '/Pickle/matched_names.pickle', 'wb') as pkl:
    pickle.dump(matched_names, pkl, pickle.HIGHEST_PROTOCOL)

print('Done')

0
Done


#### Parse results

In [10]:
matched_names_res = {}

with open(os.getcwd() + '/Pickle/matched_names.pickle', 'rb') as pkl:
    matched_names_res = pickle.load(pkl)
    
len(matched_names_res)

118659

In [11]:
matched_names_res[262150]  # key: LittleSis ent ID, value: list with node IDs from bipartite-all

[934460, 3191192]

In [12]:
clean_littlesis_names[262150]

'THOMAS ROONEY'

In [13]:
for i in matched_names_res[262150]:
    print(bipartite_all.nodes[i]['name'])

THOMAS J ROONEY
THOMAS M ROONEY


For every key in `matched_names_res` (each key is a LittleSis entity ID), we need the LittleSis relationships that involve that entity, filtered to the entity types we care about.

In [14]:
rels = pd.read_pickle(os.getcwd() + '/Pickle/all_rels.pkl')  # read LittleSis relationships
rels.shape

(1230815, 15)

In [15]:
ents = pd.read_pickle(os.getcwd() + '/Pickle/all_entities.pkl')  # read LittleSis entities
ents.shape

(284500, 13)

In [16]:
matched_ents = ents[ents['id'].isin(matched_names_res.keys())]  # keep only those ents that appear in matched_names_res
matched_ents.shape  # same len as matched_names_res

(118659, 13)

In [17]:
# Entity types we want to keep; 'Academic' has been removed
reqd_types = ['Philanthropy', 'Other Not-for-Profit', 'Academic Research Institute', 'School', 'Cultural/Arts', 'Policy/Think Tank']

# Occurrences of each type across the matched entities (one row per entity/type pair)
type_val_counts = matched_ents.explode('types')['types'].value_counts()

# Report which of the required types actually occur among the matched entities
present_types = [found_type for found_type in type_val_counts.index if found_type in reqd_types]
count = len(present_types)

for found_type in present_types:
    print(found_type)

if not present_types:
    print('None present')

Policy/Think Tank


In [18]:
def filterByType(df, types=None):
    """Return the index labels of rows whose 'types' list contains a wanted type.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a 'types' column holding a list of type names per row.
    types : list-like, optional
        Type names to keep. Defaults to the notebook-level ``reqd_types``.

    Returns
    -------
    pandas.Index
        Labels of the matching rows, each listed once (in order of first match),
        so that ``df.loc[result]`` does not repeat a row that has several of the
        wanted types.
    """
    if types is None:
        types = reqd_types

    # one entry per (row, type) pair; the row label is repeated for each type
    exploded_types = df['types'].explode()

    return exploded_types[exploded_types.isin(types)].index.unique()

index_to_keep = filterByType(matched_ents)
filtered_ents = ents.loc[index_to_keep]
filtered_ents

Unnamed: 0,summary,parent_id,extensions,website,name,primary_ext,aliases,blurb,types,updated_at,id,start_date,end_date
339139,,,"{'Person': {'name_nick': None, 'nationality': ...",,Nicole Theis,Person,[Nicole Theis],founder and president of the Delaware Family ...,"[Person, Lobbyist, Lobbying Firm, Policy/Think...",2019-02-12T21:10:10Z,339139,,


Only one of the entities in `matched_ents` satisfies the type filter defined above.

In [19]:
matched_rels = rels[rels['entity1_id'].isin(matched_ents['id']) | rels['entity2_id'].isin(matched_ents['id'])]  # keep only relationships where at least one node appears in matched_ents
matched_rels.shape

(861244, 15)

In [20]:
matched_res_counter = {i: {j: 0 for j in matched_names_res[i]} for i in matched_names_res}  # {littlesis id: {bipartite-all id: 0, ...}, ...}
len(matched_res_counter)

118659

In [21]:
# matched_names_res[262150]

In [23]:
# matched_res_counter

In [22]:
# temp_d = {262150: {934460: 0, 3191192: 0}}
temp_d = {
    262150: {934460: 0, 3191192: 0},
    262173: {5086: 0, 942152: 0, 3149304: 0, 695971: 0, 2827107: 0, 1095414: 0},
    262197: {1670657: 0, 59093: 0, 1827625: 0, 1933255: 0, 298289: 0}
         }

temp_d

{262150: {934460: 0, 3191192: 0},
 262173: {5086: 0, 942152: 0, 3149304: 0, 695971: 0, 2827107: 0, 1095414: 0},
 262197: {1670657: 0, 59093: 0, 1827625: 0, 1933255: 0, 298289: 0}}

In [24]:
for i in bipartite_all[934460]:
    print(bipartite_all.nodes[i]['Uppername'])

GREATER GUSTAVUS FUND
GUSTAVUS ADOLPHUS COLLEGE
NY POLICE AND FIRE WIDOWS & CHILDREN'S BENEFIT FUND, INC.
POLISH FALCONS AID SOCIETY
NATIONAL ASSOCIATION OF LETTER CARRIERS
CHILDREN'S PLACE AT HOME SAFE, INC.


In [25]:
x = matched_rels[matched_rels['entity1_id'].isin([262150]) | matched_rels['entity2_id'].isin([262150])]
x  # all relationships with the LittleSis ID which is a key in matched_names_res

Unnamed: 0,category_id,description2,id,description,entity1_id,is_current,updated_at,entity2_id,currency,start_date,amount,goods,description1,end_date,filings
1295038,1,,1295038,Thomas Rooney has/had a position (Board) at ...,262150,,2017-04-29T03:20:45Z,216413,,,,,Board,,
1295039,1,,1295039,Thomas Rooney has/had a position (Superintend...,262150,,2017-04-29T03:22:33Z,262149,,,,,Superintendent,,


In [26]:
# Collect the ID on the *other* side of every relationship involving 262150.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
y = pd.concat([
    x.loc[x['entity1_id'] != 262150, 'entity1_id'],
    x.loc[x['entity2_id'] != 262150, 'entity2_id'],
])
y  # all related entity IDs to the above ent

1295038    216413
1295039    262149
dtype: int64

In [27]:
# Names and IDs of the related entities, with names upper-cased.
# .copy() gives an independent frame, so the assignment below does not write into a
# slice of `ents` (avoids SettingWithCopyWarning); .str.upper() also tolerates NaN names.
z = ents.loc[ents['id'].isin(y), ['name', 'id']].copy()
z['name'] = z['name'].str.upper()
z

Unnamed: 0,name,id
262149,LINDSAY UNIFIED SCHOOL DISTRICT,262149


In [32]:
def counting_rels_direct(counter=None, rels_df=None, ents_df=None, graph=None):
    """Count exact name overlaps between LittleSis relations and bipartite-all neighbours.

    For every LittleSis entity ID in ``counter`` the upper-cased names of all
    entities it has a relationship with are collected. For every candidate
    bipartite-all node listed under that ID, the count is set to the number of
    those names that equal the 'Uppername' of one of the node's neighbours.

    Parameters
    ----------
    counter : dict, optional
        ``{littlesis_id: {bipartite_node_id: count, ...}, ...}``; updated in
        place. Defaults to the notebook-level ``temp_d``.
    rels_df : pandas.DataFrame, optional
        Relationships with 'entity1_id' and 'entity2_id' columns. Defaults to
        the notebook-level ``matched_rels``.
    ents_df : pandas.DataFrame, optional
        Entities with 'id' and 'name' columns. Defaults to the notebook-level
        ``ents``.
    graph : optional
        Object supporting ``graph[node]`` (neighbours) and ``graph.nodes[node]``
        (attribute dict). Defaults to the notebook-level ``bipartite_all``.

    Returns
    -------
    None
        Results are written into ``counter``. Progress and the related names of
        each entity are printed.
    """
    if counter is None:
        counter = temp_d
    if rels_df is None:
        rels_df = matched_rels
    if ents_df is None:
        ents_df = ents
    if graph is None:
        graph = bipartite_all

    for i, ent_id in enumerate(counter, start=1):
        if i % 100 == 0:
            print(i)

        relations = rels_df[(rels_df['entity1_id'] == ent_id) | (rels_df['entity2_id'] == ent_id)]

        # ID on the other side of each relationship (pd.concat replaces
        # Series.append, which was removed in pandas 2.0)
        related_ents_id = pd.concat([
            relations.loc[relations['entity1_id'] != ent_id, 'entity1_id'],
            relations.loc[relations['entity2_id'] != ent_id, 'entity2_id'],
        ])

        # upper-cased names of the related entities; entities without a name are skipped
        related_names = ents_df.loc[ents_df['id'].isin(related_ents_id), 'name']
        related_ents_names = set(related_names.dropna().str.upper())

        for bip_match in counter[ent_id]:
            # names of the candidate's neighbours; neighbours without 'Uppername' are skipped
            names = {
                upper_name
                for upper_name in (graph.nodes[nbr].get('Uppername') for nbr in graph[bip_match])
                if upper_name is not None
            }

            counter[ent_id][bip_match] = len(related_ents_names & names)  # number of common names

        print(i, related_ents_names)

In [35]:
temp_d = {262150: {934460: 0, 3191192: 0},
 262173: {5086: 0, 942152: 0, 3149304: 0, 695971: 0, 2827107: 0, 1095414: 0},
 262197: {1670657: 0, 59093: 0, 1827625: 0, 1933255: 0, 298289: 0}}

temp_d

{262150: {934460: 0, 3191192: 0},
 262173: {5086: 0, 942152: 0, 3149304: 0, 695971: 0, 2827107: 0, 1095414: 0},
 262197: {1670657: 0, 59093: 0, 1827625: 0, 1933255: 0, 298289: 0}}

In [39]:
def _token_minhash(text, num_perm = 128):
    """Return a MinHash built from the whitespace-separated tokens of ``text``."""
    token_hash = MinHash(num_perm = num_perm)
    for token in text.split():
        token_hash.update(token.encode('utf-8'))
    return token_hash


def counting_rels_hash(counter=None, rels_df=None, ents_df=None, graph=None):
    """Count name overlaps between LittleSis relations and bipartite-all neighbours via LSH.

    For every LittleSis entity ID in ``counter`` the upper-cased names of its
    related entities are inserted into a fresh ``MinHashLSH`` index. For every
    candidate bipartite-all node listed under that ID, each neighbour's
    'Uppername' is queried against the index and counted when the index returns
    that exact name.

    NOTE(review): because only candidates equal to the queried name are counted,
    the result is the number of exactly matching names; if near matches are
    meant to count, the equality check is the place to relax.

    Parameters
    ----------
    counter : dict, optional
        ``{littlesis_id: {bipartite_node_id: count, ...}, ...}``; updated in
        place. Defaults to the notebook-level ``temp_d``.
    rels_df : pandas.DataFrame, optional
        Relationships with 'entity1_id' and 'entity2_id' columns. Defaults to
        the notebook-level ``matched_rels``.
    ents_df : pandas.DataFrame, optional
        Entities with 'id' and 'name' columns. Defaults to the notebook-level
        ``ents``.
    graph : optional
        Object supporting ``graph[node]`` (neighbours) and ``graph.nodes[node]``
        (attribute dict). Defaults to the notebook-level ``bipartite_all``.

    Returns
    -------
    None
        Results are written into ``counter``. Each count is assigned (not added
        to the previous value), so re-running the function is safe. Progress and
        the related names of each entity are printed.
    """
    if counter is None:
        counter = temp_d
    if rels_df is None:
        rels_df = matched_rels
    if ents_df is None:
        ents_df = ents
    if graph is None:
        graph = bipartite_all

    for i, ent_id in enumerate(counter, start=1):
        if i % 100 == 0:
            print(i)

        relations = rels_df[(rels_df['entity1_id'] == ent_id) | (rels_df['entity2_id'] == ent_id)]

        # ID on the other side of each relationship (pd.concat replaces
        # Series.append, which was removed in pandas 2.0)
        related_ents_id = pd.concat([
            relations.loc[relations['entity1_id'] != ent_id, 'entity1_id'],
            relations.loc[relations['entity2_id'] != ent_id, 'entity2_id'],
        ])

        # upper-cased names of the related entities; entities without a name are skipped
        related_names = ents_df.loc[ents_df['id'].isin(related_ents_id), 'name']
        related_ents_names = set(related_names.dropna().str.upper())

        # insert LittleSis ents names related to current ent_id into hash
        lsh = MinHashLSH(threshold = 0.5, num_perm = 128)
        for n in related_ents_names:
            lsh.insert(n, _token_minhash(n))

        for bip_match in counter[ent_id]:
            # names of the candidate's neighbours; neighbours without 'Uppername' are skipped
            names = {
                upper_name
                for upper_name in (graph.nodes[nbr].get('Uppername') for nbr in graph[bip_match])
                if upper_name is not None
            }

            # index keys are unique, so a name is returned at most once per query
            match_count = 0
            for bip_name in names:
                if bip_name in lsh.query(_token_minhash(bip_name)):
                    match_count += 1

            counter[ent_id][bip_match] = match_count

        print(i, related_ents_names)

In [33]:
counting_rels_direct()

1 {'LINDSAY UNIFIED SCHOOL DISTRICT'}
2 {'BEN RHODES'}
3 {'IDENTITY EVROPA', 'WILLIAM CLARK', 'CHAD TURNER'}


In [40]:
counting_rels_hash()

1 {'LINDSAY UNIFIED SCHOOL DISTRICT'}
2 {'BEN RHODES'}
3 {'IDENTITY EVROPA', 'WILLIAM CLARK', 'CHAD TURNER'}


In [41]:
temp_d

{262150: {934460: 0, 3191192: 0},
 262173: {5086: 0, 942152: 0, 3149304: 0, 695971: 0, 2827107: 0, 1095414: 0},
 262197: {1670657: 0, 59093: 0, 1827625: 0, 1933255: 0, 298289: 0}}

In [65]:
# 33551 in matched_names_res  # Bill and Melinda Gates Foundation: False
# 1526 in matched_names_res  # Bill Gates: True
# matched_names_res[1526]
# 15367 in matched_names_res
# ents[ents['id'] == 15367]
# a = [node for node in bipartite_all if 'Uppername' in bipartite_all.nodes[node] and bipartite_all.nodes[node]['Uppername']]
a = [node for node in bipartite_all if 'Uppername' in bipartite_all.nodes[node] and  'NORTHEASTERN UNIVERSITY' in bipartite_all.nodes[node]['Uppername']]

In [68]:
ents[ents['id'] == 15367]

Unnamed: 0,summary,parent_id,extensions,website,name,primary_ext,aliases,blurb,types,updated_at,id,start_date,end_date
15367,,,"{'Org': {'name_nick': None, 'name': 'Northeast...",,Northeastern University,Org,[Northeastern University],,"[Organization, School]",2020-06-15T21:58:42Z,15367,,


In [None]:
[(9726710, 'BILL & MELINDA GATES MEDICAL RESEARCH INSTITUTE'),
 (7863433, 'BILL & MELINDA GATES FOUNDATION'),
 (8330328, 'BILL & MELINDA GATES FOUNDATION TRUST')]

# also David Rubenstein, Sharon P Rockefeller <-- fix spellings

In [66]:
a

[6819740]

In [69]:
bipartite_all.nodes[0]

{'degree': 17,
 'name': 'JOSEPH GALLO',
 'is_trustee': True,
 'is_employee': True,
 'bipartite': 1}

In [67]:
bipartite_all.nodes[6819740]

{'degree': 69,
 'assets_total': 2991273000.0,
 'revenue_investments': '81124512.0',
 'rev_govt_grants': 115396102.0,
 'naics_code': '611310',
 'year_founded': nan,
 'address_line_1': '360 Huntington Avenue',
 'num_employees': 4471.0,
 'city': 'Boston',
 'expenses_total': 1536933716.0,
 'total_revenue': 1691438907.0,
 'sic_code': '8221',
 'deductibility_code': '1',
 'zip': '02115',
 'ntee_code': 'B40',
 'bipartite': 0,
 'state': 'MA',
 'Uppername': 'NORTHEASTERN UNIVERSITY',
 'num_volunteers': 1000.0,
 'ein': '04-1679980',
 'revenue_contributions': 64844898.0,
 'employees_over_100k': 20.0}

In [57]:
x = matched_rels[matched_rels['entity1_id'].isin([1526]) | matched_rels['entity2_id'].isin([1526])]
x

Unnamed: 0,category_id,description2,id,description,entity1_id,is_current,updated_at,entity2_id,currency,start_date,amount,goods,description1,end_date,filings
92982,1,Founder,92982,Bill Gates had a position (Non-Independent Di...,1526,False,2018-04-23T14:38:26Z,44,,,,,"Non-Independent Director (Board of Directors),...",2015-00-00,
131383,8,Friend (bridge partner),131383,Warren Buffett and Bill Gates are/were Frie...,14923,,2009-09-08T09:57:12Z,1526,,,,,Friend (bridge partner),,
185527,1,Co-chair and Trustee,185527,Bill Gates has a position (Co-chair and Trust...,1526,True,2009-11-19T15:46:32Z,33551,,,,,Co-chair and Trustee,,
185528,4,husband,185528,Melinda French Gates and Bill Gates are in ...,7850,True,2009-11-19T15:48:29Z,1526,,,,,wife,,
185533,4,son,185533,William H Gates Sr and Bill Gates are in a ...,1745,True,2009-11-19T15:52:15Z,1526,,,,,father,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1651365,12,Attended / Co-Host,1651365,The Good Club and Bill Gates have/had a gen...,368337,,2020-02-26T14:22:41Z,1526,,2009-00-00,,,Launched by,,
1659626,6,Purchased Stock,1659626,Crown Castle International Corp. and Bill Ga...,59519,,2020-04-07T23:14:46Z,1526,usd,2019-02-00,627000000.0,\r\nADVERTISEMENT\r\n\r\n RSS Feed \r\nHOME\r...,Purchased by,,
1659627,12,Investment Via Crown Castle,1659627,Bill Gates and 5G have/had a generic relati...,1526,,2020-04-07T23:15:23Z,353549,,2019-02-00,,,Invested in Infrastructure,,
1676411,12,At Opening,1676411,Bill and Melinda Gates Center For Computer Sci...,375363,,2020-05-14T21:07:04Z,1526,,2019-00-00,,,At Opening,,


In [58]:
# Collect the ID on the *other* side of every relationship involving 1526.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
y = pd.concat([
    x.loc[x['entity1_id'] != 1526, 'entity1_id'],
    x.loc[x['entity2_id'] != 1526, 'entity2_id'],
])
y  # all related entity IDs to the above ent

131383      14923
185528       7850
185533       1745
748852     119578
1002476      1757
            ...  
1646091     13485
1646092     13592
1646093     13731
1646094     29942
1659627    353549
Length: 127, dtype: int64

In [60]:
# Names and IDs of the related entities, with names upper-cased.
# .copy() gives an independent frame, so the assignment below does not write into a
# slice of `ents` (avoids SettingWithCopyWarning); .str.upper() also tolerates NaN names.
z = ents.loc[ents['id'].isin(y), ['name', 'id']].copy()
z['name'] = z['name'].str.upper()
z['name']

11                                  BERKSHIRE HATHAWAY INC.
29                             COSTCO WHOLESALE CORPORATION
44                                    MICROSOFT CORPORATION
200                                  WASTE MANAGEMENT, INC.
1745                                     WILLIAM H GATES SR
                                ...                        
282754                                          DOUG BURGUM
325413    YES ON 1240: WASHINGTON COALITION FOR PUBLIC C...
353549                                                   5G
368337                                        THE GOOD CLUB
375363    BILL AND MELINDA GATES CENTER FOR COMPUTER SCI...
Name: name, Length: 118, dtype: object

In [49]:
for i in bipartite_all[181906]:
    print(bipartite_all.nodes[i]['Uppername'])

VITALANT
INTERVARSITY CHRISTIAN FELLOWSHIP/USA
BIG BRUTUS INC
FOUNDATION FOR BLIND CHILDREN
BLOODSOURCE INC
PHOENIX SYMPHONY ASSOCIATION
LIFEWORKS INC
CANADIAN INTER-VARSITY CHRISTIAN FELLOWSHIP INC
DISABLED SPORTS USA
BOYS AND GIRLS CLUB OF JACKSON COUNTY INC
MIDWEST COMMUNITY CREDIT UNION


In [42]:
matched_graph = nx.from_pandas_edgelist(matched_rels, 'entity1_id', 'entity2_id', ['category_id'], create_using = nx.MultiGraph())

In [43]:
matched_graph.number_of_nodes()

170985

In [44]:
matched_graph.number_of_edges()

861244

In [53]:
# Find the node with the highest degree in matched_graph and print it with its degree
degrees = dict(matched_graph.degree())
max_base_deg_key, max_base_deg = max(degrees.items(), key = lambda node_degree: node_degree[1])
print(max_base_deg_key, max_base_deg)

13191 9215
