In [181]:
#install the openalexapi package with pip, run through the Python interpreter of this kernel (sys.executable):
#NOTE(review): the cells visible in this notebook call the API through requests; confirm that openalexapi is still needed.
import sys
!{sys.executable} -m pip install openalexapi



In [182]:
#Import packages:
import numpy as np
import matplotlib as plt
import pandas as pd
import requests
import math
import csv

 # Experiment with queries through openAlex

In [381]:
#read keywords dictionary for search query from csv file:
#each row maps the value of its "Keywords" column to the value of its "Variations" column.
searchwords = {}
#open the file in a with-block so the handle is closed after reading;
#newline='' is the mode the csv module expects for file objects.
with open('../thesis/keywords.csv', newline='') as keywords_file:
    for row in csv.DictReader(keywords_file):
        searchwords[row["Keywords"]] = row["Variations"]

In [382]:
#display the dictionary: each keyword maps to its variations, still stored as one comma-separated string at this point:
searchwords

{'chest': 'chest radiology, CXR, chest xray',
 'image': 'image',
 'data': 'dataset, database',
 'ML': 'artificial intelligence, machine learning, CNN, deep learning, convolutional neural network, computer-aided diagnosis, classification, classifier, transfer learning, multitask learning, AI'}

In [383]:
#list every keyword (the keys of searchwords), one per line:
for keyword in searchwords.keys():
    print(keyword)

chest
image
data
ML


In [384]:
#adjust the query for OpenAlex url:
#every keyword becomes one group of quoted variations joined by "|" (the OR words);
#the groups are joined by "," so that a later split on "," gives one group per keyword.
keyword_groups = []

for keyword, variations in searchwords.items():
    #a previous run of this cell has already replaced the string by a list, so only
    #split while the value is still the raw comma-separated string (keeps the cell re-runnable):
    if isinstance(variations, str):
        #split all values for each keyword by comma (between the OR words):
        variations = variations.split(',')
        searchwords[keyword] = variations
    #strip the blanks around each variation and put quotation marks around it to be sure
    #it will search on the double words together; empty entries are skipped:
    quoted = [f'"{w.strip()}"' for w in variations if w.strip()]
    if quoted:
        keyword_groups.append('|'.join(quoted))

query = ','.join(keyword_groups)

print(query)

"chest radiology"|"CXR"|"chest xray","image","dataset"|"database","artificial intelligence"|"machine learning"|"CNN"|"deep learning"|"convolutional neural network"|"computer-aided diagnosis"|"classification"|"classifier"|"transfer learning"|"multitask learning"|"AI"


In [385]:
#split the query at the commas, giving one string per keyword group, so that the filter parameter can be added in front of each group:
query_keywords=query.split(',')

In [386]:
#show the list of keyword groups (one string per keyword):
query_keywords

['"chest radiology"|"CXR"|"chest xray"',
 '"image"',
 '"dataset"|"database"',
 '"artificial intelligence"|"machine learning"|"CNN"|"deep learning"|"convolutional neural network"|"computer-aided diagnosis"|"classification"|"classifier"|"transfer learning"|"multitask learning"|"AI"']

In [387]:
#OpenAlex works endpoint, ending in the (still empty) filter parameter:
base_url = 'https://api.openalex.org/works?filter='

#filter prefix that makes the search run on fulltexts:
param = "fulltext.search:"

#url suffix that sets the number of results per page:
max_results_per_page = "&per-page=200"

In [388]:
#start from the base url with filter and add one "<param><keywords>" clause per keyword group.
#the clauses are joined with a comma explicitly, so the result no longer depends on
#every group ending in a quotation mark:
url = base_url + ','.join(f'{param}{keywords}' for keywords in query_keywords)

#add max number of results per page to the url:
url += max_results_per_page
print(url)

https://api.openalex.org/works?filter=fulltext.search:"chest radiology"|"CXR"|"chest xray",fulltext.search:"image",fulltext.search:"dataset"|"database",fulltext.search:"artificial intelligence"|"machine learning"|"CNN"|"deep learning"|"convolutional neural network"|"computer-aided diagnosis"|"classification"|"classifier"|"transfer learning"|"multitask learning"|"AI"&per-page=200


In [389]:
#define a function that returns the meta data on the url e.g., number of search results:
def meta (url, timeout=30):
    """Request ``url`` and return the ``'meta'`` entry of the JSON response.

    Parameters
    ----------
    url : str
        Complete request url, passed on to ``requests.get``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot block the notebook indefinitely.

    Returns
    -------
    The value stored under the key ``'meta'`` of the decoded JSON body.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status; raised before the body is
        decoded so the failure is reported as an HTTP error.
    KeyError
        If the decoded body has no ``'meta'`` key.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()['meta']

In [390]:
#define a function that returns the actual results:
def results(url, timeout=30):
    """Request ``url`` and return the ``'results'`` entry of the JSON response.

    Parameters
    ----------
    url : str
        Complete request url, passed on to ``requests.get``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot block the notebook indefinitely.

    Returns
    -------
    The value stored under the key ``'results'`` of the decoded JSON body.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status; raised before the body is
        decoded so the failure is reported as an HTTP error.
    KeyError
        If the decoded body has no ``'results'`` key.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()['results']

In [391]:
#get the meta data from search queries:
#the result is stored under the name `meta`, which replaces the function of the same name.
#keep a reference to the function, so that this cell can be run again without first
#re-running the cell that defines the function:
_fetch_meta = meta if callable(meta) else _fetch_meta
meta = _fetch_meta(url)

In [377]:
#show the meta data returned for the query:
meta

{'count': 1452, 'db_response_time_ms': 280, 'page': 1, 'per_page': 200}

In [393]:
#show the complete request url:
url

'https://api.openalex.org/works?filter=fulltext.search:"chest radiology"|"CXR"|"chest xray",fulltext.search:"image",fulltext.search:"dataset"|"database",fulltext.search:"artificial intelligence"|"machine learning"|"CNN"|"deep learning"|"convolutional neural network"|"computer-aided diagnosis"|"classification"|"classifier"|"transfer learning"|"multitask learning"|"AI"&per-page=200'

In [399]:
#count number of pages (rounding up):
num_pages = math.ceil(meta['count']/meta['per_page'])

#make a list in which the results from all pages can be added:
result = []

#loop that adds the results from all available pages to the list; range() supplies
#the page number, so no manual increment of `page` is needed.
#NOTE(review): OpenAlex documents a 10,000-result limit for page-number paging; confirm cursor paging is not needed for larger result sets.
for page in range(1, num_pages+1):
    url_new = f'{url}&page={page}'
    result.extend(results(url_new))
    print(url_new) #print the url to check if the url is correct.

https://api.openalex.org/works?filter=fulltext.search:"chest radiology"|"CXR"|"chest xray",fulltext.search:"image",fulltext.search:"dataset"|"database",fulltext.search:"artificial intelligence"|"machine learning"|"CNN"|"deep learning"|"convolutional neural network"|"computer-aided diagnosis"|"classification"|"classifier"|"transfer learning"|"multitask learning"|"AI"&per-page=200&page=1
https://api.openalex.org/works?filter=fulltext.search:"chest radiology"|"CXR"|"chest xray",fulltext.search:"image",fulltext.search:"dataset"|"database",fulltext.search:"artificial intelligence"|"machine learning"|"CNN"|"deep learning"|"convolutional neural network"|"computer-aided diagnosis"|"classification"|"classifier"|"transfer learning"|"multitask learning"|"AI"&per-page=200&page=2
https://api.openalex.org/works?filter=fulltext.search:"chest radiology"|"CXR"|"chest xray",fulltext.search:"image",fulltext.search:"dataset"|"database",fulltext.search:"artificial intelligence"|"machine learning"|"CNN"|"de

In [400]:
#number of collected results; should be equal to meta['count']:
len(result)

1452

In [402]:
#normalize the results: turn the list of records into a DataFrame in which nested fields become dotted column names.
#`result` is reused for the DataFrame, so on a second run of this cell it is no longer the
#list of records; only normalize while it is still the raw list:
if not isinstance(result, pd.DataFrame):
    result = pd.json_normalize(result)

In [403]:
#show the normalized results table:
result

Unnamed: 0,id,doi,title,display_name,relevance_score,publication_year,publication_date,type,authorships,cited_by_count,...,abstract_inverted_index.ward.,abstract_inverted_index.(65%).,abstract_inverted_index.(40.9%,"abstract_inverted_index.ward),",abstract_inverted_index.Salbutamol,abstract_inverted_index.199).,abstract_inverted_index.Amongst,abstract_inverted_index.meaningfully,abstract_inverted_index.advice.,abstract_inverted_index.LVC.
0,https://openalex.org/W3101156210,https://doi.org/10.1109/cvpr.2017.369,ChestX-Ray8: Hospital-Scale Chest X-Ray Databa...,ChestX-Ray8: Hospital-Scale Chest X-Ray Databa...,2489.0654,2017,2017-07-21,proceedings-article,"[{'author_position': 'first', 'author': {'id':...",1857,...,,,,,,,,,,
1,https://openalex.org/W2731899572,https://doi.org/10.1007/s12194-017-0406-5,Overview of deep learning in medical imaging,Overview of deep learning in medical imaging,1636.1796,2017,2017-07-08,journal-article,"[{'author_position': 'first', 'author': {'id':...",473,...,,,,,,,,,,
2,https://openalex.org/W2888397986,https://doi.org/10.1186/s12938-018-0544-y,Computer-aided detection in chest radiography ...,Computer-aided detection in chest radiography ...,1134.2656,2018,2018-08-22,journal-article,"[{'author_position': 'first', 'author': {'id':...",187,...,,,,,,,,,,
3,https://openalex.org/W2150835094,https://doi.org/10.1007/s00134-012-2513-4,International evidence-based recommendations f...,International evidence-based recommendations f...,1010.3938,2012,2012-03-06,journal-article,"[{'author_position': 'first', 'author': {'id':...",1860,...,,,,,,,,,,
4,https://openalex.org/W2963466845,https://doi.org/10.1609/aaai.v33i01.3301590,CheXpert: A Large Chest Radiograph Dataset wit...,CheXpert: A Large Chest Radiograph Dataset wit...,952.6728,2019,2019-01-21,journal-article,"[{'author_position': 'first', 'author': {'id':...",821,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1447,https://openalex.org/W2953697562,https://doi.org/10.1002/ajum.12161,A 9‐year audit of fetal chest masses in an Aus...,A 9‐year audit of fetal chest masses in an Aus...,,2019,2019-06-27,journal-article,"[{'author_position': 'first', 'author': {'id':...",0,...,,,,,,,,,,
1448,https://openalex.org/W2963716134,https://doi.org/10.1017/9781107587908.026,Pulmonary Consult: Management of Severe Hypoxi...,Pulmonary Consult: Management of Severe Hypoxi...,,2019,2019-07-01,book-chapter,"[{'author_position': 'first', 'author': {'id':...",0,...,,,,,,,,,,
1449,https://openalex.org/W4210948731,https://doi.org/10.1016/b978-0-323-60987-6.000...,Acute Heart Failure,Acute Heart Failure,,2020,2020-01-01,book-chapter,"[{'author_position': 'first', 'author': {'id':...",0,...,,,,,,,,,,
1450,https://openalex.org/W2970652729,https://doi.org/10.1111/jpc.14602,Bronchiolitis at a specialist paediatric centr...,Bronchiolitis at a specialist paediatric centr...,,2020,2020-02-01,journal-article,"[{'author_position': 'first', 'author': {'id':...",0,...,[97],[140],[151],[157],[166],[173],[174],[211],[240],[252]


In [404]:
#save results to 'results.csv' in the current working directory.
#to_csv is called with its defaults, so the DataFrame index is written as the first column of the file:
result.to_csv('results.csv')