In [128]:
import pandas as pd
import os
import json
from tqdm.auto import tqdm
import networkx as nx
import semanticscholar as sch
import urllib
import json
import requests
from fuzzywuzzy import fuzz
import statistics
from networkx.algorithms.dag import descendants
import matplotlib.pyplot as plt

In [4]:
cur_path = os.getcwd()

file_path_iclr = os.path.relpath('..\\data\\paper_data_mag\\MAG_ICLR.json', cur_path)
file_path_acl = os.path.relpath('..\\data\\paper_data_mag\\MAG_ACL.json', cur_path)
file_path_cvpr = os.path.relpath('..\\data\\paper_data_mag\\MAG_CVPR.json', cur_path)
file_path_emnlp = os.path.relpath('..\\data\\paper_data_mag\\MAG_EMNLP.json', cur_path)
file_path_iccv = os.path.relpath('..\\data\\paper_data_mag\\MAG_ICCV.json', cur_path)
file_path_iclr = os.path.relpath('..\\data\\paper_data_mag\\MAG_ICLR.json', cur_path)
file_path_icml = os.path.relpath('..\\data\\paper_data_mag\\MAG_ICML.json', cur_path)
file_path_naacl = os.path.relpath('..\\data\\paper_data_mag\\MAG_NAACL.json', cur_path)
file_path_neurips = os.path.relpath('..\\data\\paper_data_mag\\MAG_NEURIPS.json', cur_path)

In [5]:
def create_paper_data_df(file_path):
    with open(file_path) as f_in:
        paper_data_dict = json.load(f_in)
    paper_data_records = paper_data_dict['entities']
    paper_data_df = pd.DataFrame.from_records(paper_data_records)
    return paper_data_df

def fuzzy_matching(x,y):
    return True if fuzz.partial_ratio(x,y) > 90 else False

In [None]:
iclr_paper_data = create_paper_data_df(file_path_iclr)
references_list_iclr = [(rid,row['Id']) for idx,row in tqdm(iclr_paper_data.iterrows()) if type(row['RId'])!=float for rid in row['RId']]

acl_paper_data = create_paper_data_df(file_path_acl)
references_list_acl = [(rid,row['Id']) for idx,row in tqdm(acl_paper_data.iterrows()) if type(row['RId'])!=float for rid in row['RId']]

cvpr_paper_data = create_paper_data_df(file_path_cvpr)
references_list_cvpr = [(rid,row['Id']) for idx,row in tqdm(cvpr_paper_data.iterrows()) if type(row['RId'])!=float for rid in row['RId']]

emnlp_paper_data = create_paper_data_df(file_path_emnlp)
references_list_emnlp = [(rid,row['Id']) for idx,row in tqdm(emnlp_paper_data.iterrows()) if type(row['RId'])!=float for rid in row['RId']]

iccv_paper_data = create_paper_data_df(file_path_iccv)
references_list_iccv = [(rid,row['Id']) for idx,row in tqdm(iccv_paper_data.iterrows()) if type(row['RId'])!=float for rid in row['RId']]

icml_paper_data = create_paper_data_df(file_path_icml)
references_list_icml = [(rid,row['Id']) for idx,row in tqdm(icml_paper_data.iterrows()) if type(row['RId'])!=float for rid in row['RId']]

naacl_paper_data = create_paper_data_df(file_path_naacl)
references_list_naacl = [(rid,row['Id']) for idx,row in tqdm(naacl_paper_data.iterrows()) if type(row['RId'])!=float for rid in row['RId']]

neurips_paper_data = create_paper_data_df(file_path_neurips)
references_list_neurips = [(rid,row['Id']) for idx,row in tqdm(neurips_paper_data.iterrows()) if type(row['RId'])!=float for rid in row['RId']]


In [23]:
all_paper_data_df = pd.concat([iclr_paper_data, acl_paper_data, cvpr_paper_data, emnlp_paper_data, iccv_paper_data, icml_paper_data, naacl_paper_data, neurips_paper_data])
all_paper_data_df.to_pickle("..\\data\\all_paper_data.pkl")

In [11]:
all_paper_data_df = pd.read_pickle("..\\data\\all_paper_data.pkl")


In [None]:
aw = all_paper_data_df.set_index('Id')['AW']
f = all_paper_data_df.set_index('Id')['F']

In [25]:
reference_df_iclr = pd.DataFrame(references_list_iclr, columns=['this_paper_infected','got_infected_by'])
reference_df_acl = pd.DataFrame(references_list_acl, columns=['this_paper_infected','got_infected_by'])
reference_df_cvpr = pd.DataFrame(references_list_cvpr, columns=['this_paper_infected','got_infected_by'])
reference_df_emnlp = pd.DataFrame(references_list_emnlp, columns=['this_paper_infected','got_infected_by'])
reference_df_iccv = pd.DataFrame(references_list_iccv, columns=['this_paper_infected','got_infected_by'])
reference_df_icml = pd.DataFrame(references_list_icml, columns=['this_paper_infected','got_infected_by'])
reference_df_naacl = pd.DataFrame(references_list_naacl, columns=['this_paper_infected','got_infected_by'])
reference_df_neurips = pd.DataFrame(references_list_neurips, columns=['this_paper_infected','got_infected_by'])

reference_df = pd.concat([reference_df_iclr, reference_df_acl, reference_df_cvpr, reference_df_emnlp, reference_df_iccv, reference_df_icml, reference_df_naacl, reference_df_neurips])
reference_df.to_pickle("..\\data\\networks\\citation_network.pkl")

In [9]:
reference_df = pd.read_pickle('..\\data\\networks\\citation_network.pkl')
reference_df

Unnamed: 0,this_paper_infected,got_infected_by
0,2963403868,2785994986
1,2963207607,2785994986
2,2134557905,2785994986
3,2154579312,2785994986
4,2966661,2785994986
...,...,...
2191,2482888308,2897127218
2192,2057624533,2897127218
2193,2103458172,2897127218
2194,2136885855,2897127218


In [12]:
valid_idx_infection = []
all_paper_ids = set(all_paper_data_df['Id'].values)
aw = all_paper_data_df.set_index('Id')['AW']
f = all_paper_data_df.set_index('Id')['F']
# print(all_paper_ids)
for idx,row in tqdm(reference_df.iterrows()):
    
    source = row['this_paper_infected']
    destination = row['got_infected_by']
    if source in all_paper_ids and destination in all_paper_ids:
        source_abstract = aw.at[source]
        dest_abstract = aw.at[destination]
        source_fos = f.at[source]
        dest_fos = f.at[destination]
        
        # Make sure these fields aren't NaNs
        if type(source_abstract) != float and type(dest_abstract) != float and type(source_fos) != float and type(dest_fos) != float:
            #abstract overlap
            source_abstract = set(source_abstract)
            dest_abstract = set(dest_abstract)
            abstract_overlap_score = len(set.intersection(source_abstract,dest_abstract))/min(len(source_abstract), len(dest_abstract))
            #FOS overlap
            source_fos = set([pair['FN'] for pair in source_fos])
            dest_fos = set([pair['FN'] for pair in dest_fos])
            fos_overlap_score = len(set.intersection(source_fos,dest_fos))/min(len(source_fos), len(dest_fos))
            if abstract_overlap_score > 0.1 and fos_overlap_score > 0.1:
                valid_idx_infection.append(idx)


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




In [None]:
infected_df = reference_df.iloc[valid_idx_infection, :]
infected_df.to_pickle("..\\data\\networks\\citation_infection_network.pkl")