In [17]:
import networkx as nx
import os
import pickle

from datasketch import MinHash, MinHashLSH
from nameparser import HumanName

#### Read bipartite graph and extract names

In [2]:
# Load the pickled bipartite graph and report how many nodes it has.
# nx.read_gpickle was deprecated in NetworkX 2.6 and removed in 3.0; it was a
# thin wrapper around pickle.load, so unpickle the graph directly instead.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open(os.getcwd() + '/Pickle/bipartite-all010420.pickle', 'rb') as graph_file:
    bipartite_all = pickle.load(graph_file)

bipartite_all.number_of_nodes()

4054832

Nodes from *bipartite_all* have the following attributes:
1. degree: int
2. name: string
3. is_trustee: bool
4. is_employee: bool
5. bipartite: int

Access a node's attributes by node id, e.g. `bipartite_all.nodes[0]`

In [4]:
# Display the attribute dictionary stored on node 0.
bipartite_all.nodes[0]

{'degree': 17,
 'name': 'JOSEPH GALLO',
 'is_trustee': True,
 'is_employee': True,
 'bipartite': 1}

In [20]:
# Pair every node id with its 'name' attribute; the other attributes are dropped.
bipartite_names = [
    (node_id, node_name)
    for node_id, node_name in bipartite_all.nodes(data = 'name')
]

# Preview the first five (node id, name) pairs.
bipartite_names[: 5]

[(0, 'JOSEPH GALLO'),
 (1, 'EMMA DUNCH'),
 (2, 'DONALD BORROR'),
 (3, 'LINDA MOFFITT'),
 (4, "M'LISS DORRANCE")]

#### Read LittleSis names

In [8]:
# Load the LittleSis names from their pickle file and show how many there are.
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
littlesis_names = []

names_pickle_path = os.getcwd() + '/Pickle/extracted_names.pickle'
with open(names_pickle_path, 'rb') as names_file:
    littlesis_names = pickle.load(names_file)

len(littlesis_names)

59

#### Clean names

In [33]:
def cleanNames(names_list):
    """Normalize person names to upper-case 'FIRST MIDDLE LAST' strings.

    Each entry is stripped, upper-cased and parsed with ``HumanName``; only the
    non-empty first, middle and last components are kept and joined with a
    single space (other parsed components are not used).

    Parameters
    ----------
    names_list : iterable
        Raw names. Entries that are ``None`` or that cannot be normalized /
        parsed are skipped rather than aborting the whole run.

    Returns
    -------
    list of str
        Cleaned names, in input order, excluding the skipped entries.

    Side effects
    ------------
    Prints the number of skipped entries followed by up to three examples.
    """
    clean_names = []
    error_list = []

    for name in names_list:
        # Missing names arrive as None; record them explicitly instead of
        # relying on an AttributeError being swallowed.
        if name is None:
            error_list.append(name)
            continue

        try:
            name = name.strip().upper()
            human_name = HumanName(name)
            name_parts = [human_name.first, human_name.middle, human_name.last]
        except Exception:
            # Best-effort cleaning: skip entries that fail to normalize/parse.
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate, so a long run can still be interrupted.
            error_list.append(name)
            continue

        clean_names.append(' '.join(part for part in name_parts if len(part) > 0))

    print(len(error_list), error_list[: 3])
    return clean_names

In [34]:
# Clean every LittleSis name.
clean_littlesis_names = cleanNames(littlesis_names)

# Clean only the first `max_num` names taken from bipartite_all.
max_num = 1000
bipartite_subset = bipartite_names[: max_num]
clean_bipartite_names = cleanNames(
    [node_name for _node_id, node_name in bipartite_subset]
)

0 []
260 [None, None, None]


#### Hash

In [35]:
# Build a MinHash LSH index over the cleaned bipartite names, keyed by the
# name itself. Each name is hashed as the set of its whitespace-separated tokens.

# The index and every MinHash inserted into it must use the same number of
# permutations, so define it once.
NUM_PERM = 128

lsh = MinHashLSH(threshold = 0.5, num_perm = NUM_PERM)

# MinHashLSH.insert raises ValueError when a key is inserted twice, and the
# cleaned names are not guaranteed to be unique. dict.fromkeys drops repeats
# while preserving first-seen order.
for name in dict.fromkeys(clean_bipartite_names):
    name_minhash = MinHash(num_perm = NUM_PERM)

    for name_token in name.split():
        name_minhash.update(name_token.encode('utf-8'))

    lsh.insert(name, name_minhash)