In [2]:
import requests
import pandas as pd
import time
from joblib import Parallel, delayed
from tqdm import tqdm
import pandas as pd

In [None]:
# Load the author table and reduce every value in its 'id' column to the
# text after the last '/' (the bare author ID).
authors_data = pd.read_csv("authors_data1.csv")
author_id_list = [
    author_url.rsplit('/', 1)[-1]
    for author_url in authors_data['id'].tolist()
]
author_id_list

# --------------
# Alternatively, skip the CSV above and assign your own list of OpenAlex
# author IDs (e.g. 'A5009943357') to `author_id_list` here.
# --------------

['A5009943357',
 'A5063445465',
 'A5102918288',
 'A5114789103',
 'A5021346979',
 'A5010846099',
 'A5056423601',
 'A5064735018',
 'A5027079032',
 'A5010053907',
 'A5076316802',
 'A5053945884',
 'A5020533147',
 'A5063436262',
 'A5025069859',
 'A5038833789',
 'A5019426968',
 'A5007136526',
 'A5092160841',
 'A5108156362',
 'A5059787882',
 'A5100652499',
 'A5049483871',
 'A5002658913',
 'A5042359253',
 'A5083467883',
 'A5002073039',
 'A5065295188',
 'A5048176602',
 'A5034728614',
 'A5014435955',
 'A5036794290',
 'A5033847936',
 'A5072392110',
 'A5027145422',
 'A5045451254',
 'A5100355277',
 'A5039940068',
 'A5101854927',
 'A5060834533',
 'A5102415619',
 'A5024623417',
 'A5100442044',
 'A5063166187',
 'A5074624254',
 'A5082558628',
 'A5038897083',
 'A5070656514',
 'A5034468528',
 'A5087725561',
 'A5026854954',
 'A5075447958',
 'A5110124902',
 'A5083277795',
 'A5033090175',
 'A5100377930',
 'A5100397455',
 'A5049723093',
 'A5002034958',
 'A5087900468',
 'A5056583184',
 'A5073754797',
 'A50558

In [None]:
# Concept filters.
# All concept IDs can be looked up at: https://api.openalex.org/concepts
# URL_filter turns each list into its own `concepts.id` filter clause.
concepts1 = [
    'C144024400',
    'C15744967',
    'C162324750',
    'C17744445',
]
concepts2 = [
    'C33923547',
    'C121332964',
    'C41008148',
]

# Sent as the `mailto` query parameter on every API request.
EMAIL = "s204120@dtu.dk"

def URL_filter(author_ids):
    """Build the OpenAlex ``/works`` request URL for a group of authors.

    The filter string is made of four comma-separated clauses:

    * ``author.id`` - the given author IDs joined with ``|``
    * ``cited_by_count:>10``
    * ``concepts.id`` - the module-level ``concepts1`` IDs joined with ``|``
    * ``concepts.id`` - the module-level ``concepts2`` IDs joined with ``|``

    Parameters
    ----------
    author_ids : iterable of str
        Author IDs to place in the ``author.id`` clause.

    Returns
    -------
    str
        The base URL followed by ``?filter=`` and the joined clauses. Paging
        and ``mailto`` parameters are added by the caller.
    """
    # NOTE(review): each concept group is wrapped in parentheses; the OpenAlex
    # docs show OR-lists without them - confirm the API accepts this form.
    clauses = [
        "author.id:" + "|".join(author_ids),
        "cited_by_count:>10",
        "concepts.id:(" + "|".join(concepts1) + ")",
        "concepts.id:(" + "|".join(concepts2) + ")",
    ]
    return "https://api.openalex.org/works" + "?filter=" + ",".join(clauses)

def fetch_author_works(author_ids, max_results=200, cursor="*", timeout=30, max_retries=3):
    """Fetch every work matching ``URL_filter(author_ids)`` via cursor paging.

    Parameters
    ----------
    author_ids : list of str
        Author IDs forwarded to ``URL_filter``. An empty list returns ``[]``
        without sending a request (the filter would otherwise be malformed).
    max_results : int, default 200
        Page size, sent as the ``per_page`` query parameter.
    cursor : str, default "*"
        Cursor to start from. Paging continues until the response no longer
        carries a truthy ``meta.next_cursor``.
    timeout : float, default 30
        Seconds to wait for each HTTP response, so a stalled connection
        cannot hang the worker indefinitely.
    max_retries : int, default 3
        Extra attempts made when the server answers 429 or 5xx, waiting
        1, 2, 4, ... seconds between attempts.

    Returns
    -------
    list of dict
        The concatenated ``results`` entries of all pages.

    Raises
    ------
    requests.HTTPError
        If a page still has an error status after the retries. Previously an
        error payload without ``meta`` ended paging silently and returned a
        truncated list.
    """
    if not author_ids:
        return []

    # The URL depends only on the author IDs, so build it once for all pages.
    url = URL_filter(author_ids)
    retryable_statuses = {429, 500, 502, 503, 504}
    attempts = max(1, max_retries + 1)
    all_works = []

    while cursor:
        params = {
            'per_page': max_results,
            'cursor': cursor,
            'mailto': EMAIL,
        }

        for attempt in range(attempts):
            response = requests.get(url, params=params, timeout=timeout)
            if response.status_code not in retryable_statuses or attempt == attempts - 1:
                break
            # Back off before retrying a rate-limited or transient failure.
            time.sleep(2 ** attempt)

        # Fail loudly instead of treating an error body as "no more pages".
        response.raise_for_status()
        data = response.json()

        all_works.extend(data.get('results') or [])
        cursor = data.get('meta', {}).get('next_cursor', None)

    return all_works

def extract_work_details(works):
    """Reduce raw work records to the fields used in the analysis.

    Parameters
    ----------
    works : iterable of dict
        Work records as returned in the ``results`` list of the API.

    Returns
    -------
    list of dict
        One dict per work with the keys ``id``, ``publication_year``,
        ``cited_by_count``, ``author_ids``, ``title`` and
        ``abstract_inverted_index``. A key missing from the record falls back
        to ``'N/A'`` (``0`` for ``cited_by_count``, an empty list for
        ``author_ids``).
    """
    def _summarise(record):
        # Collect the ID of each listed author; an authorship whose 'author'
        # dict has no 'id' contributes 'N/A'.
        contributor_ids = []
        for authorship in record.get('authorships', []):
            contributor_ids.append(authorship['author'].get('id', 'N/A'))

        return {
            'id': record.get('id', 'N/A'),
            'publication_year': record.get('publication_year', 'N/A'),
            'cited_by_count': record.get('cited_by_count', 0),
            'author_ids': contributor_ids,
            'title': record.get('title', 'N/A'),
            'abstract_inverted_index': record.get('abstract_inverted_index', 'N/A'),
        }

    return [_summarise(record) for record in works]


def fetch_and_process_batch(chunk):
    """Fetch the works for one chunk of author IDs and return their extracted details.

    This is the unit of work dispatched to each parallel worker: the chunk is
    passed to ``fetch_author_works`` and the result to ``extract_work_details``.
    """
    return extract_work_details(fetch_author_works(chunk))

# batch_size = number of authors per API request; n_jobs = number of concurrent workers.
def Request_batching(author_ids, batch_size=25, n_jobs=9):
    """Fetch and extract works for all authors, a few batches at a time.

    The author IDs are split into chunks of ``batch_size``. The chunks are
    then processed in groups: every chunk in a group is handed to
    ``fetch_and_process_batch`` concurrently, the group's results are appended
    in chunk order, and the loop pauses one second before the next group.

    Parameters
    ----------
    author_ids : list of str
        Author IDs to query.
    batch_size : int, default 25
        Number of author IDs per API request. Must be at least 1.
    n_jobs : int, default 9
        Passed to ``joblib.Parallel``. A positive value is also the number of
        chunks per group. For joblib's non-positive/``None`` conventions
        (e.g. ``-1``) the group size falls back to the CPU count; such values
        previously made the loop do nothing and return an empty list.

    Returns
    -------
    list of dict
        Extracted work records of all chunks, concatenated. Results are not
        de-duplicated across chunks.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    chunks = [author_ids[i:i + batch_size] for i in range(0, len(author_ids), batch_size)]

    # n_jobs cannot be used directly as the range() stride: joblib accepts
    # negative values, which would yield an empty range and skip all work.
    if n_jobs is not None and n_jobs > 0:
        group_size = n_jobs
    else:
        import os
        group_size = os.cpu_count() or 1

    all_works_data = []
    with tqdm(total=len(chunks), desc="Fetching batches") as pbar:
        for start in range(0, len(chunks), group_size):  # one group of chunks per round
            batch_chunks = chunks[start:start + group_size]
            # The work is network-bound, so ask joblib for threads rather
            # than its default worker processes.
            results = Parallel(n_jobs=n_jobs, prefer="threads")(
                delayed(fetch_and_process_batch)(chunk) for chunk in batch_chunks
            )

            for batch in results:
                all_works_data.extend(batch)

            pbar.update(len(batch_chunks))
            # Pause between groups to go easy on the API; no need to wait
            # after the final group.
            if start + group_size < len(chunks):
                time.sleep(1)

    return all_works_data

# Run the full harvest for every ID in author_id_list (performs network requests).
# NOTE(review): results are concatenated per author batch without
# de-duplication, so a work whose authors fall into different batches may
# appear more than once - consider dropping duplicates on 'id' downstream.
all_works_data = Request_batching(author_id_list)

# One row per extracted work record; columns follow the record keys.
df = pd.DataFrame(all_works_data)
