In [1]:
import os
import time
import json
import numpy as np
import requests as req
from tqdm import tqdm
from IPython.display import clear_output
from concurrent.futures import ThreadPoolExecutor

In [2]:
def make_base_url(paper_id:str) -> str:
    """Return the Semantic Scholar Graph API citations endpoint URL for `paper_id`."""
    endpoint = 'https://api.semanticscholar.org/graph/v1/paper/{}/citations'
    return endpoint.format(paper_id)

In [3]:
# SECURITY: the API key used to be hardcoded in this cell. It is now read from the
# S2_API_KEY environment variable; the previously committed key should be treated
# as leaked and rotated.
API_KEY = os.environ.get('S2_API_KEY', '')

# Only send the auth header when a key is configured, so an empty key is never sent.
# NOTE(review): assumes unauthenticated requests are acceptable as a fallback -- confirm.
headers = {'x-api-key': API_KEY} if API_KEY else {}
# Ask the API for the paper ID only; nothing else is used downstream in this notebook.
params = {
    'fields': 'paperId',
}

In [5]:
def make_request(paper_id:str):
    """Fetch the citations of one paper from the citations endpoint.

    Args:
        paper_id: Identifier interpolated into the endpoint URL by `make_base_url`.

    Returns:
        A single-entry dict ``{paper_id: data}`` where ``data`` is the ``'data'``
        list of the JSON response, or ``[]`` when the request fails, the body is
        not valid JSON, or the payload carries no ``'data'`` list.
    """
    # NOTE(review): only a single response page is requested, so papers with more
    # citations than the API's default page size are presumably truncated --
    # confirm against the endpoint's `offset`/`limit` parameters.
    try:
        # A timeout keeps one stalled connection from blocking a worker forever.
        r = req.get(make_base_url(paper_id), headers=headers, params=params, timeout=30)
        resp = r.json()
    except (req.RequestException, ValueError):
        # Network failure or non-JSON body: degrade to "no citations", the same
        # fallback already used for payloads without a 'data' field, instead of
        # aborting the whole pool.map run.
        resp = None
    finally:
        # Crude per-thread throttle between consecutive requests.
        time.sleep(0.1)
    if isinstance(resp, dict) and isinstance(resp.get('data'), list):
        return {paper_id: resp['data']}
    return {paper_id: []}

In [6]:
# Load the cleaned paper records. The encoding is pinned to UTF-8 so the result
# does not depend on the platform's default locale encoding.
with open('../data/cleaned_final.json', 'r', encoding='utf-8') as f:
    ids = json.load(f)

In [7]:
# Keep only the paper ID of every loaded record.
# NOTE: this rebinds `ids` from records to ID strings, so the cell is not re-runnable
# without reloading the JSON first.
ids = [record['paperId'] for record in ids]

In [7]:
# ids = np.load('unique_ids_list.npy', allow_pickle=True)

In [8]:
# os.cpu_count() may return None when the CPU count cannot be determined;
# fall back to 1 so the addition cannot raise TypeError.
executor = ThreadPoolExecutor(max_workers=(os.cpu_count() or 1) + 4)

In [9]:
# Build the pool inside this cell: a `with` block shuts its executor down on exit,
# so reusing one created in another cell makes this cell fail on a second run.
# Results come back in the same order as `ids` (Executor.map preserves input order).
with ThreadPoolExecutor(max_workers=(os.cpu_count() or 1) + 4) as pool:
    additional_data = list(tqdm(pool.map(make_request, ids), total=len(ids)))

100%|██████████| 12927/12927 [04:46<00:00, 45.05it/s] 


In [11]:
# Replace each raw citation record with just the citing paper's ID (mutates in place).
for entry in additional_data:
    key = list(entry)[0]
    entry[key] = [cit['citingPaper']['paperId'] for cit in entry[key]]

In [13]:
# Unique paper IDs that appear as the key of a fetched entry.
id_keys = list({list(entry)[0] for entry in additional_data})

In [16]:
# Keep only citations that point at papers inside our own collection.
# Membership is tested against a set: checking every citation against the
# `id_keys` list made this cell quadratic in the number of papers.
known_ids = set(id_keys)
for entry in additional_data:
    key = list(entry)[0]
    entry[key] = [cit for cit in entry[key] if cit in known_ids]

In [18]:
# Merge the single-entry dicts into one {paper_id: [citing ids]} mapping.
# `for key in [...]` just binds the entry's first key inside the comprehension.
d = {
    key: entry[key]
    for entry in additional_data
    for key in [list(entry)[0]]
}

In [20]:
# Serialize the citation mapping to disk as JSON.
with open('../data/citations_relations.json', 'w') as out_file:
    out_file.write(json.dumps(d))

In [8]:
# Collect the entries that carry an 'error' field, together with their positions.
faulty = []
indexes = []
for n, sample in enumerate(additional_data):
    if 'error' not in sample:
        continue
    # Fourth whitespace-separated token of the error message
    # (presumably the paper ID -- confirm against the API's error format).
    faulty.append(sample['error'].split()[3])
    indexes.append(n)

In [47]:
# Positions of the entries whose paper ID is listed in `faulty`.
indexes = [n for n, ent in enumerate(additional_data) if ent['paperId'] in faulty]

In [31]:
# Also exclude the papers without an abstract. Only IDs not already present are
# appended, so re-running the cell does not duplicate them.
# NOTE: `absless` is defined in a cell further down; run that cell first.
faulty = faulty + [key for key in absless if key not in faulty]

In [32]:
# Inspect the IDs that the following cells exclude.
faulty

['fee07fc3047af2af4b69a0989163873b39ea6cbf',
 '98cf7c59fb08827f5677c43637c42c5abc36e807',
 'None',
 'ff19d2d9980b47974fa2ad2d7eef154b9636fb44',
 'a747fc1205ddd2519beb0825e28d547ca2c7c366',
 '14c33602a4730c92b2cd794e2725e3743653af08']

In [48]:
# Drop the flagged positions; note the result is a NumPy array, not a list.
as_array = np.array(additional_data)
additional_data = np.delete(as_array, indexes)

In [33]:
# Make sure `ids` is a plain list so list methods can be used in the removal loop
# (presumably it may have been loaded as a NumPy array -- TODO confirm).
ids = list(ids)

In [35]:
# Remove every faulty ID from `ids`, skipping the 'None' placeholder string.
for f_id in faulty:
    if f_id == 'None' or f_id not in ids:
        continue
    ids.remove(f_id)

In [18]:
# Load the citation relations array from disk.
# SECURITY: allow_pickle=True unpickles arbitrary objects -- only load files you produced yourself.
cits = np.load('../data/citation_relations.npy', allow_pickle=True)

In [19]:
# Sanity check: number of loaded citation entries.
len(cits)

9128

In [36]:
# Positions of the citation entries whose key (first dict key) is a faulty ID.
indexes = [n for n, ent in enumerate(cits) if list(ent)[0] in faulty]

In [37]:
# Drop the flagged citation entries.
cits_array = np.array(cits)
cits = np.delete(cits_array, indexes)

In [38]:
# Convert back to a plain Python list for the per-entry edits below.
cits = list(cits)

In [39]:
# Remove references to faulty papers from every remaining entry (mutates in place).
for ent in cits:
    key = list(ent)[0]
    ent[key] = [ref for ref in ent[key] if ref not in faulty]

In [40]:
# Sanity check: number of citation entries left after dropping the faulty ones.
len(cits)

9127

In [49]:
# Build a {paperId: metadata} lookup holding only the fields listed here.
LOOKUP_FIELDS = ('title', 'abstract', 'year', 'fieldsOfStudy', 'authors', 'tldr')

quick_lookup = {}
for n, data in enumerate(additional_data):
    try:
        quick_lookup[data['paperId']] = {field: data[field] for field in LOOKUP_FIELDS}
    except Exception as e:
        # Best effort: report the offending position and keep going.
        print(n, e)

In [50]:
# Snapshot of the paper IDs present in the lookup.
keys = list(quick_lookup)

In [51]:
# IDs of the papers whose abstract is missing (None).
# Written as a comprehension so the loop no longer rebinds the global `d`,
# which holds the citation mapping built earlier in the notebook.
absless = [key for key in keys if quick_lookup[key]['abstract'] is None]

In [52]:
# Inspect the papers without an abstract.
absless

[]

In [53]:
# Persist the filtered citation entries.
# NOTE: this writes to the same path that `cits` was loaded from earlier in the
# notebook, so the original file is overwritten.
np.save('../data/citation_relations.npy', np.array(cits))

In [54]:
# Persist the lookup dict. np.save wraps a dict in a 0-d object array, so it has to be
# read back with np.load(..., allow_pickle=True).item().
np.save('../data/additional_info_dict.npy', quick_lookup)