In [1]:
import networkx as nx
import os
import pickle

from datasketch import MinHash, MinHashLSH
from nameparser import HumanName

#### Read bipartite graph and extract names

In [2]:
bipartite_all = nx.read_gpickle(os.getcwd() + '/Pickle/bipartite-all010420.pickle')
bipartite_all.number_of_nodes()

4054832

Nodes from *bipartite_all* have the following attributes:
1. degree: int
2. name: string
3. is_trusted: bool
4. is_employee: bool
5. bipartite: int

Access: bipartite_all.nodes[0]

In [3]:
bipartite_all.nodes[0]

{'degree': 17,
 'name': 'JOSEPH GALLO',
 'is_trustee': True,
 'is_employee': True,
 'bipartite': 1}

In [4]:
bipartite_names = list(bipartite_all.nodes(data = 'name'))  # extract only names
bipartite_names[: 5]

[(0, 'JOSEPH GALLO'),
 (1, 'EMMA DUNCH'),
 (2, 'DONALD BORROR'),
 (3, 'LINDA MOFFITT'),
 (4, "M'LISS DORRANCE")]

#### Read LittleSis names

In [5]:
littlesis_names = {}

with open(os.getcwd() + '/Pickle/all_people.pickle', 'rb') as pkl:
    littlesis_names = pickle.load(pkl)
    
len(littlesis_names)

205548

In [8]:
littlesis_names['1024']  # key is an ent ID

'Roland A Hernandez'

#### Clean names

In [10]:
def cleanNames(names_dict):
    clean_names = {}
    error_count, error_list = 0, []
    
    for name_id in names_dict:
        try:
            name = names_dict[name_id].strip().upper()

            human_name = HumanName(name)

            names_list = [human_name.first, human_name.middle, human_name.last]
            names_list = [n for n in names_list if len(n) > 0]

            if type(name_id) == str:  # convert string ent IDs to int
                name_id = int(name_id)
                
            clean_names[name_id] = ' '.join(names_list)
        
        except:
            error_count += 1
            error_list.append(name_id)
            
#     print(error_count, error_list[: 3])
    print(error_count)
    return clean_names

In [11]:
clean_littlesis_names = cleanNames(littlesis_names)

0


In [13]:
clean_littlesis_names[1024]

'ROLAND A HERNANDEZ'

In [18]:
max_num = 1000  # max number of names from bipartite_all, change this value to change the size of the bipartite_all subset!
bipartite_names_temp = {item[0]: item[1] for item in bipartite_names[: max_num] if item[1] is not None}
clean_bipartite_names = cleanNames(bipartite_names_temp)
remaining_bipartite = len(clean_bipartite_names)

0


In [19]:
print(len(clean_littlesis_names), len(clean_bipartite_names))

205548 740


#### Hash

In [44]:
lsh = MinHashLSH(threshold = 0.5, num_perm = 128)

for i, name in enumerate(clean_bipartite_names):
    if i % 100 == 0:
        print('Inserting', i, 'out of', remaining_bipartite)
    
    p = MinHash(num_perm = 128)
    
    for name_split in name.split():
        p.update(name_split.encode('utf-8'))
        
    lsh.insert(name, p)

Inserting 0 out of 740
Inserting 100 out of 740
Inserting 200 out of 740
Inserting 300 out of 740
Inserting 400 out of 740
Inserting 500 out of 740
Inserting 600 out of 740
Inserting 700 out of 740


#### Compare names

In [45]:
def compareNames(person1, person2):
    same_person_flag = False
    
    if person1 != person2:
        names1 = person1.split()
        names2 = person2.split()
        
        common_names = list(set(names1) & set(names2))  # intersection
        
        # we need names of min len = 3 to be same
        common_names_min_len = [n for n in common_names if len(n) >= 3]
        
        if len(common_names_min_len) > 1:  # if at least 2 names satisfy min len condition
            if len(common_names) == len(names1) or len(common_names) == len(names2):
                same_person_flag = True
            
            elif (len(names1) - len(common_names) == 1 or len(names2) - len(common_names) == 1) and (len(names1) - len(common_names) <= 2 or len(names2) - len(common_names) <= 2) and names1[0] == names2[0] and names1[-1] == names2[-1]:
                neither = list(set(names1).union(set(names2)) - (set(names1) & set(names2)))  # neither i.e. complement = union - intersection
                
                if len(neither) == 2 and (len(neither[0]) == 1 or len(neither[1] == 1)) and neither[0][0] == neither[1][0]:
                    same_person_flag = True
    
    return same_person_flag

In [48]:
clean_littlesis_names_id = {num: n for num, n in enumerate(clean_littlesis_names)}  # the number should be the key as the names don't 
clean_bipartite_names_id = {num: n for num, n in enumerate(clean_bipartite_names)}
print(len(clean_littlesis_names_id), len(clean_bipartite_names_id))

205548 740


In [51]:
matched_names = {}

for i, name in enumerate(clean_littlesis_names):  # for each name in littlesis data, find matches in the hash
    if i % 100 == 0:
        print('Comparing', i)
        
    p = MinHash(num_perm = 128)
    
    for name_split in name.split():
        p.update(name_split.encode('utf-8'))
    
    res = lsh.query(p)
    
    for bip_name in res:
        same_person = compareNames(name, bip_name)
        
        if same_person:
            matched_names[clean_littlesis_names_id[i]] = clean_bipartite_names_id[bip_name]

print(len(matched_names))

# with open(os.getcwd() + '/Pickle/matched_names.pickle', 'wb') as pkl:
#     pickle.dump(matched_names, pkl, pickle.HIGHEST_PROTOCOL)
    
print('Done')

Comparing 0
Comparing 100
Comparing 200
Comparing 300
Comparing 400
Comparing 500


KeyError: 'JOSHUA SMITH'