In [1]:
import networkx as nx
import os
import pandas as pd
import pickle

from datasketch import MinHash, MinHashLSH

#### Read LittleSis orgs

In [2]:
# Load the pickled LittleSis organisations from ./Pickle (relative to the working directory).
# NOTE(review): pickle.load can execute arbitrary code - only load pickle files you trust.
littlesis_orgs = {}
littlesis_pickle_path = os.getcwd() + '/Pickle/all_orgs.pickle'

with open(littlesis_pickle_path, 'rb') as pkl:
    littlesis_orgs = pickle.load(pkl)

# Number of organisations loaded
len(littlesis_orgs)

78526

In [45]:
# Turn the LittleSis mapping into a two-column frame: one row per (key, value) pair of littlesis_orgs.
# The columns are named at construction time (the same way bipartite_df is built) instead of
# passing a list through DataFrame.from_dict - which is meant for dict input - and renaming the
# positional columns in place afterwards. This also yields the 'id' and 'org' columns when the
# mapping is empty.
littlesis_df = pd.DataFrame(list(littlesis_orgs.items()), columns = ['id', 'org'])
littlesis_df.head()

Unnamed: 0,id,org
0,1,Walmart
1,2,ExxonMobil
2,3,Chevron
3,4,General Motors Company
4,5,ConocoPhillips


#### Read bipartite graph

In [3]:
# Load the pickled bipartite graph from ./Pickle (relative to the working directory).
# nx.read_gpickle was deprecated in NetworkX 2.6 and removed in 3.0; for a plain (uncompressed)
# pickle file it simply called pickle.load, so unpickle directly to work on any NetworkX version.
# NOTE(review): pickle.load can execute arbitrary code - only load pickle files you trust.
with open(os.getcwd() + '/Pickle/bipartite-all010420.pickle', 'rb') as pkl:
    bipartite_all = pickle.load(pkl)

# Total number of nodes in the graph
bipartite_all.number_of_nodes()

4054832

In [46]:
# Pair every node id with its 'Uppername' attribute.
# Nodes which don't have this key come back with None, eg. (bipartite_id, None)
uppername_pairs = bipartite_all.nodes(data = 'Uppername')
bipartite_orgs = list(uppername_pairs)

# Peek at the first few pairs
bipartite_orgs[: 5]

[(0, None), (1, None), (2, None), (3, None), (4, None)]

In [47]:
# Keep only the nodes that carry an uppername; a None uppername means the node is not an organisation
bipartite_orgs = [
    (node_id, node_name)
    for node_id, node_name in bipartite_orgs
    if node_name is not None
]

# Peek at the first few remaining pairs
bipartite_orgs[: 5]

[(8388614, 'MISSIONS UNLIMITED'),
 (8388623, 'CORVALLIS-OSU SYMPHONY SOCIETY'),
 (8388624, 'OLD CHURCH SOCIETY INC'),
 (8388646, 'TVW INC'),
 (8388664, 'EDWARD F ARNOLD TRUST')]

In [48]:
# Number of (id, uppername) pairs left after dropping nodes whose uppername is None
len(bipartite_orgs)

696376

In [49]:
# Tabulate the (node id, uppername) pairs as a frame with columns 'id' and 'org'
bipartite_columns = ['id', 'org']
bipartite_df = pd.DataFrame(bipartite_orgs, columns = bipartite_columns)

bipartite_df.head()

Unnamed: 0,id,org
0,8388614,MISSIONS UNLIMITED
1,8388623,CORVALLIS-OSU SYMPHONY SOCIETY
2,8388624,OLD CHURCH SOCIETY INC
3,8388646,TVW INC
4,8388664,EDWARD F ARNOLD TRUST


#### Clean org names

In [50]:
def cleanNames(df):
    """Return a copy of df with integer ids and normalised organisation names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'id' column (values accepted by int()) and an 'org' column of strings.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched. 'id' is converted with int() unless
        it is already int64. Each 'org' value has periods and commas removed, hyphens turned into
        spaces, is upper-cased, and has every run of whitespace collapsed to a single space with
        leading and trailing whitespace stripped.

    Raises
    ------
    AttributeError
        If an 'org' value is not a string (it has no translate method).
    """
    # '.' and ',' are deleted outright; '-' becomes a space so hyphenated words stay separate
    punctuation = str.maketrans({'.': None, ',': None, '-': ' '})

    def _normalise(name):
        # split() + join collapses whitespace runs, so 'A - B' ends up as 'A B' rather than 'A   B',
        # and also strips leading and trailing whitespace
        return ' '.join(name.translate(punctuation).upper().split())

    cleaned = df.copy()  # work on a copy so the caller's frame is not modified

    if cleaned['id'].dtype != 'int64':  # convert ids to int
        cleaned['id'] = cleaned['id'].map(int)

    cleaned['org'] = cleaned['org'].map(_normalise)

    # other things to do (potentially):
    # 1. remove INC., LTD.
    # 2. remove special chars
    # also see Pyjanitor clean names: https://pyjanitor.readthedocs.io/reference/janitor.functions/janitor.clean_names.html

    return cleaned

In [51]:
# Normalise the LittleSis ids and names with cleanNames so they can be compared with the bipartite names
littlesis_df = cleanNames(littlesis_df)
littlesis_df.head()

Unnamed: 0,id,org
0,1,WALMART
1,2,EXXONMOBIL
2,3,CHEVRON
3,4,GENERAL MOTORS COMPANY
4,5,CONOCOPHILLIPS


In [52]:
# Normalise the bipartite ids and names with the same cleanNames routine used for the LittleSis frame
bipartite_df = cleanNames(bipartite_df)
bipartite_df.head()

Unnamed: 0,id,org
0,8388614,MISSIONS UNLIMITED
1,8388623,CORVALLIS OSU SYMPHONY SOCIETY
2,8388624,OLD CHURCH SOCIETY INC
3,8388646,TVW INC
4,8388664,EDWARD F ARNOLD TRUST


In [67]:
# LittleSis lookup: id -> cleaned org name
littlesis_names_by_id = littlesis_df.set_index('id')['org']
clean_littlesis_orgs = littlesis_names_by_id.to_dict()

# max_num selects the bipartite subset size, select the second option to use the entire subset
max_num = 1000
# max_num = bipartite_df.shape[0]

# Bipartite lookup restricted to the chosen subset: id -> cleaned org name
bipartite_names_by_id = bipartite_df.set_index('id')['org']
clean_bipartite_orgs = bipartite_names_by_id[: max_num].to_dict()

# Reverse lookup: cleaned org name -> id.
# NOTE(review): ids that share the same cleaned name collapse to one entry here (the last id wins).
clean_bipartite_orgs_rev = {org_name: bip_id for bip_id, org_name in clean_bipartite_orgs.items()}

# print(len(clean_littlesis_orgs), len(clean_bipartite_orgs), len(clean_bipartite_orgs_rev))

#### Hash bipartite org names into a MinHash LSH index

In [74]:
# Build a MinHash LSH index keyed by cleaned bipartite org name.
# Each name is hashed from its whitespace-separated words; names already present in the index
# are counted in rep_count and collected in reps instead of being inserted again.
remaining_bipartite = len(clean_bipartite_orgs)
count, rep_count, reps = 0, 0, set()
lsh = MinHashLSH(threshold = 0.5, num_perm = 128)

for org_id in clean_bipartite_orgs:
    p = MinHash(num_perm = 128)  # must use the same num_perm as the index

    org = clean_bipartite_orgs[org_id]

    for org_split in org.split():
        p.update(org_split.encode('utf-8'))

    try:
        lsh.insert(org, p)
    except ValueError:
        # datasketch raises ValueError when the key (org name) is already in the index.
        # Only that error is treated as a repeat; anything else (including a keyboard
        # interrupt) now propagates instead of being silently counted as a duplicate.
#         print(org)
        rep_count += 1
        reps.add(org)
        continue

    count += 1
    if count % 100 == 0:
        print('Inserting', count)  #, 'out of', remaining_bipartite)
    
# print('Repeated orgs:', rep_count)
# print(reps)

Inserting 100
Inserting 200
Inserting 300
Inserting 400
Inserting 500
Inserting 600
Inserting 700
Inserting 800


#### Compare org names

In [76]:
def compare(org1, org2):
    """Decide whether two cleaned organisation names refer to the same organisation.

    Parameters
    ----------
    org1, org2 : str
        Organisation names; they are split into words on whitespace.

    Returns
    -------
    bool
        True when the names are identical, or when they share at least two significant words
        (length >= 3 and not in the blacklist) and every word of one name also appears in the
        other. False otherwise.
    """
    blacklist = ('INC', 'LTD')

    if org1 == org2:  # if strings match exactly
        return True

    # Work on sets of words so a repeated word in a name cannot throw off the containment check
    org1_words = set(org1.split())
    org2_words = set(org2.split())

    common = org1_words & org2_words
    # min len of common word is 3, word shouldn't be in the blacklist
    common_min = [word for word in common if word not in blacklist and len(word) >= 3]

    if len(common_min) <= 1:  # need at least 2 significant common words
        return False

    # Same organisation if one name's words are entirely contained in the other's.
    # (Previously this check referenced undefined variable names; the resulting NameError was
    # swallowed by a bare except, so only exact matches were ever reported.)
    # TODO: work out the conditions for names that overlap without one containing the other
    return org1_words <= org2_words or org2_words <= org1_words

In [82]:
# Query the LSH index with every LittleSis name, confirm each candidate with compare(),
# and record the ids of the bipartite orgs that match.
count = 0
matched_orgs = {}  # {littlesis_id: [bipartite_id_1, bipartite_id_2, ...], ...}

for org_id, org in clean_littlesis_orgs.items():
    count += 1
    if count % 500 == 0:
        print('Comparing', count)

    # Hash the LittleSis name word by word, the same way the indexed names were hashed
    p = MinHash(num_perm = 128)
    for org_split in org.split():
        p.update(org_split.encode('utf-8'))

    # Candidate bipartite names returned by the index for this hash
    res = lsh.query(p)

    for bip_org in res:
        if not compare(org, bip_org):  # skip candidates that do not match the littlesis_org
            continue

        # Look the id up first so a missing name leaves matched_orgs untouched
        bip_id = clean_bipartite_orgs_rev[bip_org]
        matched_orgs.setdefault(org_id, []).append(bip_id)

# Persist the matches under ./Pickle (relative to the working directory)
with open(os.getcwd() + '/Pickle/matched_orgs.pickle', 'wb') as pkl:
    pickle.dump(matched_orgs, pkl, pickle.HIGHEST_PROTOCOL)

print('Done')