In [1]:
import networkx as nx
import numpy as np
import os
import pandas as pd
import pickle

In [2]:
# Entity table (one row per entity; used below for 'id', 'name', 'primary_ext').
ents = pd.read_pickle(f"{os.getcwd()}/Pickle/all_entities.pkl")
ents.shape

(284500, 13)

In [3]:
# Relationship table; each row links 'entity1_id' to 'entity2_id'.
rels = pd.read_pickle(f"{os.getcwd()}/Pickle/all_rels.pkl")
rels.shape

(1230815, 15)

In [4]:
# Organisations already matched between the two sources ('ls_id' <-> 'bp_id').
filtered_data = pd.read_pickle(f"{os.getcwd()}/Pickle/filtered_org_match.pkl")
filtered_data.shape

(7454, 18)

In [5]:
# Start from an empty mapping so the name is bound even before the load runs.
matched_names_res = {}

# NOTE: unpickling runs arbitrary code -- only load files you produced yourself.
with open(f"{os.getcwd()}/Pickle/matched_names.pickle", mode='rb') as pkl:
    matched_names_res = pickle.load(pkl)

# One row per key of the mapping: key -> 'ls_id', value -> 'bp_id'.
matched_name_df = pd.DataFrame(list(matched_names_res.items()))
matched_name_df = matched_name_df.rename(columns={0: 'ls_id', 1: 'bp_id'})
matched_name_df.shape

(118659, 2)

In [6]:
# Flatten list-valued 'bp_id' cells so each (ls_id, bp_id) pair gets its own row;
# the original row index is repeated for pairs that came from the same row.
matched_name_df_exploded = matched_name_df.explode(column='bp_id')
matched_name_df_exploded.head()

Unnamed: 0,ls_id,bp_id
0,262146,998678
1,262147,636330
2,262150,934460
2,262150,3191192
3,262154,638094


In [7]:
# Load the pickled bipartite graph.
# `nx.read_gpickle` was removed in NetworkX 3.0; reading the file with `pickle`
# directly is the documented replacement and also works on older versions.
# NOTE: unpickling runs arbitrary code -- only load files you produced yourself.
with open(os.getcwd() + '/Pickle/bipartite-all010420.pickle', 'rb') as pkl:
    bipartite_all = pickle.load(pkl)

bipartite_all.number_of_nodes()

4054832

In [8]:
# Peek at the id pair and relationship-count columns of the first few matched orgs.
filtered_data[['ls_id', 'bp_id', 'ls_rels', 'bp_rels']].head()

Unnamed: 0,ls_id,bp_id,ls_rels,bp_rels
1236,14713,6822486,711,9
1252,14730,6830462,931,7
1284,14922,8904282,18,5
1285,14924,7871420,406,18
1286,14925,7513160,147,34


In [9]:
def forwardMatch(x_ls_id):
    """Find people related to org ``x`` in LittleSis that are also its graph neighbours.

    ``x`` is the current org. Starting from the LittleSis side, the function
    collects every 'Person' entity that shares a relationship with ``x``, looks
    up their name-match candidates in ``matched_name_df`` and keeps only the
    candidates that are neighbours of ``x`` in ``bipartite_all``.

    Reads the notebook-level objects ``filtered_data``, ``rels``, ``ents``,
    ``bipartite_all`` and ``matched_name_df``.

    Parameters
    ----------
    x_ls_id
        LittleSis id of the org; must be present in ``filtered_data['ls_id']``.

    Returns
    -------
    dict or None
        ``None`` when ``x`` has no related 'Person' entity in LittleSis.
        Otherwise ``DataFrame.to_dict()`` of a frame with the columns
        ``bp_id``, ``ls_id``, ``name_ls`` and ``name_bp`` (empty inner dicts
        when nothing matches).

    Raises
    ------
    IndexError
        If ``x_ls_id`` has no row in ``filtered_data`` (from ``.iloc[0]``).
    """
    # Graph-side id of the same org.
    x_bp_id = filtered_data.loc[filtered_data['ls_id'] == x_ls_id, 'bp_id'].iloc[0]

    # --- LittleSis side: people sharing a relationship with x -------------
    touches_x = (rels['entity1_id'] == x_ls_id) | (rels['entity2_id'] == x_ls_id)
    x_ls_rels = rels[touches_x]
    # x may sit on either end of a relationship; take the opposite end each time.
    # (pd.concat replaces Series.append, which was removed in pandas 2.0.)
    related_to_x_ls_id = pd.concat([
        x_ls_rels.loc[x_ls_rels['entity1_id'] != x_ls_id, 'entity1_id'],
        x_ls_rels.loc[x_ls_rels['entity2_id'] != x_ls_id, 'entity2_id'],
    ]).unique()

    related_to_x_ls = ents[ents['id'].isin(related_to_x_ls_id)]
    related_to_x_ls_names = (
        related_to_x_ls.loc[related_to_x_ls['primary_ext'] == 'Person', ['id', 'name']]
        .rename(columns={'id': 'ls_id'})
        .reset_index(drop=True)
    )

    if related_to_x_ls_names.empty:  # nothing to match against
        return None

    # --- Graph side: neighbours of x and their names ----------------------
    # Columns are named at construction so the frame keeps its 'bp_id'/'name'
    # columns even when x has no neighbours (renaming a column-less frame
    # would leave nothing to select below).
    related_to_x_bp_names = pd.DataFrame(
        [(node_id, bipartite_all.nodes[node_id]['name']) for node_id in bipartite_all[x_bp_id]],
        columns=['bp_id', 'name'],
    )

    # --- Candidates for x's LittleSis people, restricted to x's neighbours --
    all_x_matches = matched_name_df[matched_name_df['ls_id'].isin(related_to_x_ls_names['ls_id'])]
    all_x_matches_exploded = all_x_matches.explode('bp_id')
    confirmed = all_x_matches_exploded[
        all_x_matches_exploded['bp_id'].isin(related_to_x_bp_names['bp_id'])
    ]

    # Attach the LittleSis name, then the graph name; the clashing 'name'
    # columns come out of the second join as 'name_ls' / 'name_bp'.
    with_ls_names = (
        confirmed.set_index('ls_id')
        .join(related_to_x_ls_names.set_index('ls_id'), on='ls_id', lsuffix='_left', rsuffix='_right')
        .reset_index()
    )
    final = (
        with_ls_names.set_index('bp_id')
        .join(related_to_x_bp_names.set_index('bp_id'), on='bp_id', lsuffix='_ls', rsuffix='_bp')
        .reset_index()
    )

    return final.to_dict()

In [10]:
# Two sample orgs (LittleSis ids 15367 and 33551) for a quick trial run;
# .copy() so adding columns later does not touch filtered_data.
abc = filtered_data.loc[filtered_data['ls_id'].isin([15367, 33551])].copy()
abc

Unnamed: 0,ls_id,name,bp_id,match_name,match_value,summary,parent_id,extensions,website,primary_ext,aliases,blurb,types,updated_at,start_date,end_date,ls_rels,bp_rels
1435,15367,NORTHEASTERN UNIVERSITY,6819740,NORTHEASTERN UNIVERSITY,1.0,,,"{'Org': {'name_nick': None, 'name': 'Northeast...",,Org,[Northeastern University],,"[Organization, School]",2020-06-15T21:58:42Z,,,79,69
6493,33551,BILL & MELINDA GATES FOUNDATION,7863433,BILL & MELINDA GATES FOUNDATION,1.0,,,"{'Org': {'name_nick': None, 'name': 'Bill & Me...",,Org,"[Bill & Melinda Gates Foundation, Bill and Mel...",Family foundation of Microsoft founder Bill Gates,"[Organization, Philanthropy]",2020-07-14T02:25:29Z,,,420,19


In [11]:
def reverseMatch(x_ls_id):
    """Find name matches for org ``x`` starting from its graph neighbours.

    ``x`` is the current org. Starting from the ``bipartite_all`` side, the
    function takes every neighbour of ``x`` in the graph and returns all
    (ls_id, bp_id) name-match pairs from ``matched_name_df_exploded`` whose
    ``bp_id`` is one of those neighbours, with both names attached.

    Reads the notebook-level objects ``filtered_data``, ``rels``, ``ents``,
    ``bipartite_all`` and ``matched_name_df_exploded``.

    Parameters
    ----------
    x_ls_id
        LittleSis id of the org; must be present in ``filtered_data['ls_id']``.

    Returns
    -------
    dict or None
        ``None`` when ``x`` has no related 'Person' entity in LittleSis.
        Otherwise ``DataFrame.to_dict()`` of a frame with the columns
        ``bp_id``, ``ls_id``, ``name_ls`` and ``name_bp`` (empty inner dicts
        when nothing matches).

    Raises
    ------
    IndexError
        If ``x_ls_id`` has no row in ``filtered_data`` (from ``.iloc[0]``).
    """
    # Graph-side id of the same org.
    x_bp_id = filtered_data.loc[filtered_data['ls_id'] == x_ls_id, 'bp_id'].iloc[0]

    # --- LittleSis side: only used to decide whether to bail out early ----
    touches_x = (rels['entity1_id'] == x_ls_id) | (rels['entity2_id'] == x_ls_id)
    x_ls_rels = rels[touches_x]
    # x may sit on either end of a relationship; take the opposite end each time.
    # (pd.concat replaces Series.append, which was removed in pandas 2.0.)
    related_to_x_ls_id = pd.concat([
        x_ls_rels.loc[x_ls_rels['entity1_id'] != x_ls_id, 'entity1_id'],
        x_ls_rels.loc[x_ls_rels['entity2_id'] != x_ls_id, 'entity2_id'],
    ]).unique()

    is_related_person = ents['id'].isin(related_to_x_ls_id) & (ents['primary_ext'] == 'Person')
    if not is_related_person.any():  # x has no people in LittleSis
        return None

    # --- Graph side: neighbours of x and their names ----------------------
    # Columns are named at construction so the frame keeps its 'bp_id'/'name'
    # columns even when x has no neighbours (renaming a column-less frame
    # would leave nothing to select below).
    related_to_x_bp_names = pd.DataFrame(
        [(node_id, bipartite_all.nodes[node_id]['name']) for node_id in bipartite_all[x_bp_id]],
        columns=['bp_id', 'name'],
    )

    # Every name-match pair whose graph id is a neighbour of x.
    all_x_matches_rev = matched_name_df_exploded[
        matched_name_df_exploded['bp_id'].isin(related_to_x_bp_names['bp_id'])
    ]

    # Only 'name' is needed from ents; indexing just these two columns avoids
    # re-indexing the whole entity table on every call.
    ls_names = ents[['id', 'name']].rename(columns={'id': 'ls_id'}).set_index('ls_id')

    # Attach the LittleSis name, then the graph name; the clashing 'name'
    # columns come out of the second join as 'name_ls' / 'name_bp'.
    with_ls_names = (
        all_x_matches_rev.set_index('ls_id')
        .join(ls_names, on='ls_id', lsuffix='_left', rsuffix='_right')
        .reset_index()
    )
    final = (
        with_ls_names.set_index('bp_id')
        .join(related_to_x_bp_names.set_index('bp_id'), on='bp_id', lsuffix='_ls', rsuffix='_bp')
        .reset_index()[['bp_id', 'ls_id', 'name_ls', 'name_bp']]
    )

    return final.to_dict()

In [12]:
# Trial run: reverse-match each sample org and keep the result dict per row.
abc['rev'] = abc.loc[:, 'ls_id'].apply(reverseMatch)
abc

Unnamed: 0,ls_id,name,bp_id,match_name,match_value,summary,parent_id,extensions,website,primary_ext,aliases,blurb,types,updated_at,start_date,end_date,ls_rels,bp_rels,rev
1435,15367,NORTHEASTERN UNIVERSITY,6819740,NORTHEASTERN UNIVERSITY,1.0,,,"{'Org': {'name_nick': None, 'name': 'Northeast...",,Org,[Northeastern University],,"[Organization, School]",2020-06-15T21:58:42Z,,,79,69,"{'bp_id': {0: 112459, 1: 295562, 2: 252548, 3:..."
6493,33551,BILL & MELINDA GATES FOUNDATION,7863433,BILL & MELINDA GATES FOUNDATION,1.0,,,"{'Org': {'name_nick': None, 'name': 'Bill & Me...",,Org,"[Bill & Melinda Gates Foundation, Bill and Mel...",Family foundation of Microsoft founder Bill Gates,"[Organization, Philanthropy]",2020-07-14T02:25:29Z,,,420,19,"{'bp_id': {0: 809124, 1: 516547, 2: 2725019, 3..."


In [None]:
# Full run: reverse-match every matched org (one reverseMatch call per row).
# Fixed typo: Series has no `.appy`; the intended method is `.apply`.
filtered_data['rev'] = filtered_data['ls_id'].apply(reverseMatch)

In [None]:
# Persist the matched orgs together with their reverse-match results.
filtered_data.to_pickle(f"{os.getcwd()}/Pickle/filtered_org_match_rev.pkl")
print('Done')