# In [8]:
import urllib.request
import urllib.parse
import json
import pprint

class CoreApiRequestor:
    """Small HTTP GET client for a CORE-style (core.ac.uk, API v2) search API.

    URLs are built as ``endpoint + method + '/' + quoted query`` followed by
    a query string carrying the API key and paging parameters.

    NOTE(review): the response schema (a ``data`` list of records and an
    integer ``totalHits``) is inferred from how this class reads the
    decoded JSON; confirm against the API documentation.
    """

    # Upper bound on the number of pages get_up_to_20_pages_of_query fetches.
    MAX_PAGES = 20

    def __init__(self, endpoint, api_key):
        """Store the base endpoint URL and the API key; set paging defaults."""
        self.endpoint = endpoint
        self.api_key = api_key
        # defaults
        self.pagesize = 100
        self.page = 1

    def parse_response(self, decoded):
        """Return ``[title, doi]`` pairs for every record in *decoded*.

        *decoded* is one JSON-decoded response dict.  For each record the
        first identifier starting with ``'doi:'`` is used; ``doi`` is
        ``None`` when the record has none.  A missing or null ``data`` /
        ``identifiers`` entry is treated as empty, and a missing ``title``
        yields ``None``, so sparse records do not abort the whole parse.
        """
        res = []
        for item in decoded.get('data') or []:
            identifiers = item.get('identifiers') or []
            doi = next(
                (ident for ident in identifiers
                 if ident and ident.startswith('doi:')),
                None,
            )
            res.append([item.get('title'), doi])
        return res

    def request_url(self, url):
        """GET *url* and return the raw response body as ``bytes``.

        Errors raised by ``urllib.request.urlopen`` propagate to the caller.
        """
        with urllib.request.urlopen(url) as response:
            body = response.read()
        return body

    def get_method_query_request_url(self, method, query, fullText, page):
        """Build the request URL for *query* against *method*.

        *fullText* is interpreted by truthiness and sent as the literal
        string ``'true'`` or ``'false'``.  *page* is the page number to
        request; the page size comes from ``self.pagesize``.
        """
        params = {
            'apiKey': self.api_key,
            'page': page,
            'pageSize': self.pagesize,
            'fulltext': 'true' if fullText else 'false',
        }
        return (self.endpoint + method + '/' + urllib.parse.quote(query)
                + '?' + urllib.parse.urlencode(params))

    def get_up_to_20_pages_of_query(self, method, query, fulltext):
        """Fetch every result page for *query*, capped at ``MAX_PAGES`` pages.

        Returns a list of decoded JSON responses, one per page, starting
        with page 1.  The page count is derived from the first response's
        ``totalHits`` and ``self.pagesize``, rounding up so a trailing
        partial page is included.  The *fulltext* flag is applied to every
        page, not only the first.  The URL of each follow-up page is
        printed before it is requested.
        """
        first_url = self.get_method_query_request_url(method, query, fulltext, 1)
        first_page = json.loads(self.request_url(first_url).decode('utf-8'))
        all_articles = [first_page]

        total_hits = first_page.get('totalHits') or 0
        # Ceiling division: e.g. 250 hits at 100 per page -> 3 pages.
        num_pages = min(-(-total_hits // self.pagesize), self.MAX_PAGES)

        # range() excludes its stop value, hence the +1 to reach the last page.
        for page in range(2, num_pages + 1):
            url = self.get_method_query_request_url(method, query, fulltext, page)
            print(url)
            resp = self.request_url(url)
            all_articles.append(json.loads(resp.decode('utf-8')))
        return all_articles


# In [10]:
import os
import pickle
import pandas as pd

# Harvest the first result page for each topic from the CORE search API and
# store it twice under Dir_Path: the raw response as "<topic>.json" and a
# three-column summary (id, publisher, description) as "<topic>.csv".

# Output directory for the per-topic files (machine-specific absolute path).
Dir_Path = "/Users/pranjali/Downloads/Wiki_BiasDetection/Data/Technical_Papers/Papers_Data/"
endpoint = 'https://core.ac.uk/api-v2'
method = '/articles/search'
# SECURITY: a credential is committed in source. Set the CORE_API_KEY
# environment variable to override it; the literal is kept only as a
# backward-compatible fallback and should be rotated and removed.
api_key = os.environ.get('CORE_API_KEY', 'zXFb53YjLQ8f0Kw7V6H4nqMPtpcgyuNC')
api = CoreApiRequestor(endpoint,api_key)

topics = ['blockchain', 'deep AND learning', 'quantum AND physics', 'inorganic AND chemistry', 'psychology', 'astrophysics',
         'group AND theory', 'molecular AND networks', 'computation AND language', 'economics', 'signal AND processing']
FullText = False

# makedirs also creates missing parent directories and tolerates an
# already-existing target, unlike a bare os.mkdir.
os.makedirs(Dir_Path, exist_ok=True)

# CSV column order; identical for every topic, so defined once.
columns = ['id', 'publisher', 'description']

for topic in topics:

    url = api.get_method_query_request_url(method,topic,FullText,1)

    result = api.request_url(url)
    result_str = result.decode('utf8')
    data = json.loads(result_str)

    # Keep the untouched API response next to the CSV summary.
    with open(Dir_Path + topic + '.json', 'w') as file:
        json.dump(data, file)

    # Records without an 'id' are skipped; absent optional fields become "NA".
    # A missing or null 'data' entry is treated as "no results".
    topic_papers = [
        [paper['id'], paper.get('publisher', "NA"), paper.get('description', "NA")]
        for paper in (data.get('data') or [])
        if 'id' in paper
    ]

    # Build and write the frame once per topic (not once per paper), so a
    # topic with no usable records still yields a header-only CSV.
    df = pd.DataFrame(topic_papers, columns=columns)
    df.to_csv(Dir_Path + topic + '.csv')
        