In [1]:
import os
import time
import json
import numpy as np
import requests as req
from tqdm import tqdm
from IPython.display import clear_output
from concurrent.futures import ThreadPoolExecutor

In [2]:
def make_cit_url(paper_id:str) -> str:
    return f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}/citations' 

def make_base_url(paper_id:str) -> str:
    return f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}'

In [3]:
API_KEY = 'qZWKkOKyzP5g9fgjyMmBt1MN2NTC6aT61UklAiyw'
BERT_PAPER_ID = 'df2b0e26d0599ce3e70df8a9da02e51594e0e992'

headers = {'x-api-key': API_KEY}
params = {
    'fields': 'abstract,url,title,citationCount',
}

In [4]:
r = req.get(make_base_url(BERT_PAPER_ID), headers=headers, params=params)

In [5]:
original = json.loads(r.content)

In [6]:
original

{'paperId': 'df2b0e26d0599ce3e70df8a9da02e51594e0e992',
 'url': 'https://www.semanticscholar.org/paper/df2b0e26d0599ce3e70df8a9da02e51594e0e992',
 'title': 'BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding',
 'abstract': 'We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models (Peters et al., 2018a; Radford et al., 2018), BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task-specific architecture modifications. BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results 

In [7]:
def download_direct_citations(paper_id:str):
    try:
        params = {
            'fields': 'abstract,url,title,citationCount',
        }
        params['offset'] = '0'
        r = req.get(make_base_url(paper_id), headers=headers, params=params)
        cit_count = json.loads(r.content)['citationCount']
        max_cit_count = cit_count
        chunk_size = 0
        if cit_count > 9999: max_cit_count = 9999
        if max_cit_count > 1000:
            chunk_size = 1000
        else:
            chunk_size = max_cit_count

        data = []
        params['limit'] = str(chunk_size)
        while len(data) < max_cit_count:
            diff = max_cit_count - len(data)
            if diff < int(params['limit']):
                params['limit'] = str(diff)
            time.sleep(0.05)
            r = req.get(make_cit_url(paper_id), headers=headers, params=params)
            j = json.loads(r.content)
            data.extend(j['data'])
            if 'next' in j.keys(): 
                params['offset'] = str(j['next'])
            # clear_output(wait=True)
            # print(len(data))
        return {paper_id: data}
    except Exception as e:
        print(e)

In [8]:
data = download_direct_citations(BERT_PAPER_ID)

In [13]:
data[BERT_PAPER_ID][0]

{'citingPaper': {'paperId': '000aa5d7c23a43503f2de2b188f99965808b0f56',
  'url': 'https://www.semanticscholar.org/paper/000aa5d7c23a43503f2de2b188f99965808b0f56',
  'title': 'RuBERT Embeddings in the Task of Classifying User Posts on a Social Media',
  'abstract': 'This paper presents models for solving the problem of multiclass classification of user posts in a social media. These models are based on embeddings extracted from messages using the RuBert language model and a fully connected neural network built over it. The models presented are compared to a baseline model using long-term short-term memory neurons (LSTM). The results will improve the accuracy of the classification posts, which in turn will improve the accuracy of assessing the psychological characteristics users.',
  'citationCount': 0}}

In [14]:
np.save('../data/direct_citations.npy', data)

In [14]:
citations_paper_ids = list(set([cit['citingPaper']['paperId'] for cit in data[BERT_PAPER_ID]]))

In [17]:
executor = ThreadPoolExecutor(max_workers=os.cpu_count() + 4)

In [18]:
with executor as pool:
    data_indirect = list(tqdm(pool.map(download_direct_citations, citations_paper_ids), total=len(citations_paper_ids)))

 14%|█▍        | 1447/9994 [00:14<01:06, 128.00it/s]

'citationCount'


 45%|████▌     | 4521/9994 [00:57<01:37, 55.89it/s] 

'citationCount'


100%|██████████| 9994/9994 [02:18<00:00, 72.14it/s] 


In [21]:
data_indirect[3]

{'39528ef1de5a6c1b4fba44071591e9f12167769c': [{'citingPaper': {'paperId': '97af09b1436e768019aed4023cd1f9e3ccb9a635',
    'url': 'https://www.semanticscholar.org/paper/97af09b1436e768019aed4023cd1f9e3ccb9a635',
    'title': 'Medical Visual Question Answering: A Survey',
    'abstract': 'aFaculty of Engineering, Monash University, Clayton, VIC, 3800 Australia beResearch Center, Monash University, Clayton, VIC, 3800 Australia cNVIDIA AI Technology Center, 038988, Singapore dState Key Laboratory of Ophthalmology, Zhongshan Ophthalmic Center, Sun Yat-Sen University, Guangzhou, 510060 China eFaculty of Information Technology, Monash University, Clayton, 3800, VIC, Australia fAustralian Centre for Robotic Vision, The University of Adelaide, Adelaide, SA 5005, Australia gEye Research Australia, Royal Victorian Eye and Ear Hospital, East Melbourne, VIC, 3002 Australia hAirdoc Research, Melbourne, VIC, 3000 Australia iMonash-NVIDIA AI Tech Centre, Melbourne, VIC, 3000 Australia',
    'citationC

In [22]:
np.save('../data/data_indirect.npy', data_indirect)