In [1]:
import networkx as nx
import os
import pickle

from datasketch import MinHash, MinHashLSH
from nameparser import HumanName

#### Read bipartite graph and extract names

In [6]:
# Load the pickled bipartite graph.
# `nx.read_gpickle` was removed in networkx 3.0; for an uncompressed file it was a thin
# wrapper around `pickle.load`, so calling pickle directly works on every networkx version.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you trust.
with open(os.getcwd() + '/Pickle/bipartite-all010420.pickle', 'rb') as pkl:
    bipartite_all = pickle.load(pkl)

bipartite_all.number_of_nodes()

4054832

Nodes from *bipartite_all* have the following attributes:
1. degree: int
2. name: string
3. is_trustee: bool
4. is_employee: bool
5. bipartite: int

Access a node's attributes with `bipartite_all.nodes[<node ID>]`, e.g. `bipartite_all.nodes[0]`.

In [7]:
# Display the attribute dict stored on node 0 as a quick check of the node schema
bipartite_all.nodes[0]

{'degree': 17,
 'name': 'JOSEPH GALLO',
 'is_trustee': True,
 'is_employee': True,
 'bipartite': 1}

In [4]:
# Build a list of (node ID, name) pairs; only the 'name' attribute is pulled from each node
bipartite_names = [
    (node_id, node_name)
    for node_id, node_name in bipartite_all.nodes(data = 'name')
]
bipartite_names[: 5]

[(0, 'JOSEPH GALLO'),
 (1, 'EMMA DUNCH'),
 (2, 'DONALD BORROR'),
 (3, 'LINDA MOFFITT'),
 (4, "M'LISS DORRANCE")]

#### Read LittleSis names

In [8]:
# Start from an empty mapping, then replace it with the unpickled one.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you trust.
littlesis_names = {}
littlesis_path = os.getcwd() + '/Pickle/all_people.pickle'

with open(littlesis_path, 'rb') as pkl:
    littlesis_names = pickle.load(pkl)

len(littlesis_names)

205548

In [9]:
# Spot-check one entry: here the key is an ent ID given as a string, the value is the raw name
littlesis_names['1024']  # key is an ent ID

'Roland A Hernandez'

#### Clean names

In [10]:
def cleanNames(names_dict):
    """Normalise raw person names to upper-case, space-separated name strings.

    Each name is stripped, upper-cased and parsed with ``HumanName``; only the
    parsed first, middle and last components are kept (empty components are
    dropped) and joined with single spaces. Any other parsed component is
    discarded.

    Parameters
    ----------
    names_dict : dict
        Maps an ID (int or str) to a raw name string.

    Returns
    -------
    dict
        Maps the ID to the cleaned name. String IDs are converted to int.
        Entries that fail to clean (e.g. a non-string name, or a string ID
        that is not an integer) are left out.

    Side effects
    ------------
    Prints the number of entries that failed to clean.
    """
    clean_names = {}
    error_list = []  # IDs whose name could not be cleaned

    for raw_id, raw_name in names_dict.items():
        try:
            name = raw_name.strip().upper()
            human_name = HumanName(name)

            parts = [human_name.first, human_name.middle, human_name.last]
            parts = [part for part in parts if len(part) > 0]

            # convert string ent IDs to int so both data sources share one key type
            clean_id = int(raw_id) if isinstance(raw_id, str) else raw_id

            clean_names[clean_id] = ' '.join(parts)

        # Catch Exception rather than using a bare except, so KeyboardInterrupt and
        # SystemExit still stop a long-running clean instead of being counted as bad names.
        except Exception:
            error_list.append(raw_id)

    print(len(error_list))
    return clean_names

In [11]:
# Clean every LittleSis name; the number printed by cleanNames is how many entries failed to clean
clean_littlesis_names = cleanNames(littlesis_names)

0


In [12]:
# Spot-check the cleaned entry: cleanNames converts string IDs to int, so the key is now 1024, not '1024'
clean_littlesis_names[1024]

'ROLAND A HERNANDEZ'

In [15]:
# Upper bound on how many bipartite-all entries are processed; change it to resize the subset
max_num = 1000
# max_num = len(bipartite_names)  # use this instead to run on every name in bipartite-all

# node ID -> raw name for the first max_num entries, leaving out nodes whose name is None
# (the ID is the node ID in the nx object)
bipartite_names_temp = {
    node_id: node_name
    for node_id, node_name in bipartite_names[: max_num]
    if node_name is not None
}
clean_bipartite_names = cleanNames(bipartite_names_temp)

# cleaned name -> node ID
# NOTE: if two nodes share the same cleaned name, only the later node ID is kept here
clean_bipartite_names_reverse = {
    cleaned_name: node_id
    for node_id, cleaned_name in clean_bipartite_names.items()
}
remaining_bipartite = len(clean_bipartite_names)

0


In [16]:
# Sizes of: cleaned LittleSis names, cleaned bipartite names, and the name -> node ID reverse map
print(*map(len, (clean_littlesis_names, clean_bipartite_names, clean_bipartite_names_reverse)))

205548 740 740


#### Hash

In [17]:
count = 0  # number of names actually inserted into the index
lsh = MinHashLSH(threshold = 0.5, num_perm = 128)

for name_id in clean_bipartite_names:
    name = clean_bipartite_names[name_id]

    # The cleaned name is the LSH key, and MinHashLSH.insert raises ValueError when a key
    # is inserted twice. Different nodes can share a cleaned name, so skip names that are
    # already indexed instead of aborting the whole run.
    if name in lsh:
        continue

    # Sketch the name as the set of its whitespace-separated tokens
    p = MinHash(num_perm = 128)

    for name_split in name.split():
        p.update(name_split.encode('utf-8'))

    lsh.insert(name, p)

    count += 1
    if count % 100 == 0:
        print('Inserting', count, 'out of', remaining_bipartite)

Inserting 100 out of 740
Inserting 200 out of 740
Inserting 300 out of 740
Inserting 400 out of 740
Inserting 500 out of 740
Inserting 600 out of 740
Inserting 700 out of 740


In [18]:
# Drop the ID -> name dict now that the names are in the LSH index; the matching loop only
# looks names up through clean_bipartite_names_reverse.
# After this cell runs, the insertion loop cannot be re-run until the cleaning cell is re-run.
del clean_bipartite_names
print('Inserted', count, 'names from bipartite-all')

Inserted 740 names from bipartite-all


#### Compare names

In [19]:
def compareNames(person1, person2):
    """Decide whether two cleaned name strings refer to the same person.

    Names are compared as whitespace-separated tokens. Two names match when:

    * they are identical strings; or
    * they share at least two tokens of length >= 3, and either
        - every token of one name appears in the other
          (e.g. 'THOMAS ROONEY' vs 'THOMAS J ROONEY'), or
        - one name has exactly one token outside the shared set, first and
          last tokens agree, and the two unshared tokens are an initial and a
          name starting with that initial
          (e.g. 'JOHN A SMITH' vs 'JOHN ALBERT SMITH').

    Parameters
    ----------
    person1, person2 : str
        Space-separated name strings.

    Returns
    -------
    bool
        True if the names are judged to be the same person, else False.
    """
    if person1 == person2:
        return True

    names1 = person1.split()
    names2 = person2.split()

    common_names = set(names1) & set(names2)  # intersection

    # we need at least 2 shared names of min len = 3
    if sum(1 for n in common_names if len(n) >= 3) < 2:
        return False

    # all tokens of one name are contained in the other
    if len(common_names) == len(names1) or len(common_names) == len(names2):
        return True

    extra1 = len(names1) - len(common_names)
    extra2 = len(names2) - len(common_names)

    if (extra1 == 1 or extra2 == 1) and names1[0] == names2[0] and names1[-1] == names2[-1]:
        # neither i.e. complement = union - intersection (symmetric difference)
        neither = list(set(names1) ^ set(names2))

        if len(neither) == 2:
            first, second = neither
            # One of the two unshared tokens must be a single-letter initial matching the
            # other's first letter. The original wrote `len(neither[1] == 1)`, which raised
            # TypeError (then silently swallowed) whenever the set happened to list the
            # full name first, so the result depended on set iteration order.
            return (len(first) == 1 or len(second) == 1) and first[0] == second[0]

    return False

In [21]:
count = 0
matched_names = {}

# For each name in the LittleSis data, look up candidate matches in the hash
for name_id, name in clean_littlesis_names.items():
    count += 1
    if not count % 500:
        print('Comparing', count)

    # Sketch the name from its whitespace-separated tokens
    p = MinHash(num_perm = 128)
    for name_split in name.split():
        p.update(name_split.encode('utf-8'))

    # LSH only proposes candidates; compareNames makes the final decision
    for bip_name in lsh.query(p):
        if not compareNames(name, bip_name):
            continue

        # One LittleSis ID can match several bipartite names, so collect node IDs in a list
        bip_node_id = clean_bipartite_names_reverse[bip_name]
        matched_names.setdefault(name_id, []).append(bip_node_id)

print(len(matched_names))

# Persist the matches: LittleSis ent ID -> list of bipartite-all node IDs
with open(os.getcwd() + '/Pickle/matched_names.pickle', 'wb') as pkl:
    pickle.dump(matched_names, pkl, pickle.HIGHEST_PROTOCOL)

print('Done')

0
Done


#### Parse results

In [2]:
# Start from an empty mapping, then replace it with the matches saved on disk.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you trust.
matched_names_res = {}
matched_names_path = os.getcwd() + '/Pickle/matched_names.pickle'

with open(matched_names_path, 'rb') as pkl:
    matched_names_res = pickle.load(pkl)

len(matched_names_res)

118659

In [5]:
# Spot-check one match; a single LittleSis entity can map to several bipartite-all nodes
matched_names_res[262150]  # key: LittleSis ent ID, value: list with node IDs from bipartite-all

[934460, 3191192]

In [13]:
# Cleaned LittleSis name for the ent ID inspected above
clean_littlesis_names[262150]

'THOMAS ROONEY'

In [14]:
# Attributes of one of the matched bipartite-all nodes, to compare its name with the LittleSis name
bipartite_all.nodes[934460]

{'degree': 6,
 'name': 'THOMAS J ROONEY',
 'is_trustee': True,
 'is_employee': True,
 'bipartite': 1}