In [12]:
import requests
import pandas as pd
import time
from joblib import Parallel, delayed
from tqdm import tqdm
import pandas as pd

In [19]:
#authors_data = pd.read_csv("authors_data1.csv")
authors_data = pd.read_csv("ic2s2_coauthors.csv")
author_id_list = authors_data['id'].tolist()
author_id_list = [url.split('/')[-1] for url in author_id_list]
len(author_id_list)

# --------------
# or use your own author ID list here
# --------------

26887

In [20]:
# Define Concepts
# can find all concept ID's at: https://api.openalex.org/concepts
concepts1 = ['C144024400', 'C15744967', 'C162324750', 'C17744445']
concepts2 = ['C33923547', 'C121332964', 'C41008148']
EMAIL = "s204120@dtu.dk"

# Function to generate OpenAlex API request URL
def URL_filter(author_ids):
    base_url = "https://api.openalex.org/works"
    author_filter = f"author.id:{'|'.join(author_ids)}"
    concepts1_filter = '|'.join(concepts1)
    concepts2_filter = '|'.join(concepts2)
    filters = (
        "?filter="
        f"{author_filter},"
        f"cited_by_count:>10,"
        f"concepts.id:({concepts1_filter}),"
        f"concepts.id:({concepts2_filter})"
    )
    
    return base_url + filters

# Function to fetch works for a given batch of authors
def fetch_author_works(author_ids, max_results=200, cursor="*"):
    all_works = []
    
    while cursor:
        url = URL_filter(author_ids)
        params = {
            'per_page': max_results, 
            'cursor': cursor,  
            'mailto': EMAIL,
        }

        response = requests.get(url, params=params)
        data = response.json()

        if 'results' in data and data['results']:
            all_works.extend(data['results'])
        cursor = data.get('meta', {}).get('next_cursor', None)

    return all_works

# Function to extract relevant details from works
def extract_work_details(works):
    works_data = []
    for work in works:
        work_info = {
            'id': work.get('id', 'N/A'),
            'publication_year': work.get('publication_year', 'N/A'),
            'cited_by_count': work.get('cited_by_count', 0),
            'author_ids': [author['author'].get('id', 'N/A') for author in work.get('authorships', [])],
            'title': work.get('title', 'N/A'),
            'abstract_inverted_index': work.get('abstract_inverted_index', 'N/A'),
        }
        works_data.append(work_info)
    return works_data


def fetch_and_process_batch(chunk):
    works = fetch_author_works(chunk)
    return extract_work_details(works)

#Batch size = how many authors per API request, n_jobs are parallel threads.
def Request_batching(author_ids, batch_size=25, n_jobs=9):
    chunks = [author_ids[i:i + batch_size] for i in range(0, len(author_ids), batch_size)]
    
    all_works_data = []
    with tqdm(total=len(chunks), desc="Fetching batches") as pbar:
        for i in range(0, len(chunks), n_jobs):  # Process in groups of 10
            batch_chunks = chunks[i:i + n_jobs]  # Take up to 10 batches
            results = Parallel(n_jobs=n_jobs)(
                delayed(fetch_and_process_batch)(chunk) for chunk in batch_chunks
            )
            
            for batch in results:
                all_works_data.extend(batch)
            
            pbar.update(len(batch_chunks))  
            time.sleep(1)  

    return all_works_data

all_works_data = Request_batching(author_id_list)

df = pd.DataFrame(all_works_data)


Fetching batches: 100%|██████████| 1076/1076 [23:45<00:00,  1.33s/it]


Save to separate files:

In [21]:
df_papers = pd.DataFrame(all_works_data, columns=['id', 'publication_year', 'cited_by_count', 'author_ids'])
df_abstracts = pd.DataFrame(all_works_data, columns=['id', 'title', 'abstract_inverted_index'])

df_papers.to_csv("ic2s2_papers_coauthors.csv", index=False)
df_abstracts.to_csv("ic2s2_abstracts_coauthors.csv", index=False)

Unique authors ID's (Co-author list)

In [None]:
ls = []
for i in df_papers['author_ids']:
    ls.extend(i)
#Only unique authors:
ls = list(set([url.split('/')[-1] for url in ls]))


['A5083798768',
 'A5062400717',
 'A5107868703',
 'A5014610963',
 'A5051492143',
 'A5111576791',
 'A5087852808',
 'A5078705441',
 'A5088933473',
 'A5022386372',
 'A5064592144',
 'A5107874562',
 'A5081968938',
 'A5017260327',
 'A5089016187',
 'A5010920098',
 'A5058945731',
 'A5108100055',
 'A5032058863',
 'A5085527500',
 'A5024873469',
 'A5100740473',
 'A5084556788',
 'A5107866452',
 'A5109937854',
 'A5106442212',
 'A5073306953',
 'A5040021742',
 'A5064710755',
 'A5049700217',
 'A5068265896',
 'A5035927195',
 'A5108785770',
 'A5039346080',
 'A5051573571',
 'A5101794839',
 'A5043407684',
 'A5079951798',
 'A5018461674',
 'A5037742971',
 'A5088242810',
 'A5024966207',
 'A5036555597',
 'A5062395162',
 'A5078468500',
 'A5101491932',
 'A5017103799',
 'A5081992122',
 'A5044566727',
 'A5065620590',
 'A5001094018',
 'A5046595623',
 'A5113722764',
 'A5008854508',
 'A5036141800',
 'A5036149819',
 'A5025582934',
 'A5016062101',
 'A5066748874',
 'A5064729514',
 'A5109239826',
 'A5018216287',
 'A50298

In [None]:
import requests
import pandas as pd
import time
from joblib import Parallel, delayed
from tqdm import tqdm

# Define Concepts
concepts1 = ['C144024400', 'C15744967', 'C162324750', 'C17744445']
concepts2 = ['C33923547', 'C121332964', 'C41008148']
EMAIL = "s204120@dtu.dk"

def URL_filter_authors(author_ids):
    """Generates OpenAlex API request URL for authors."""
    base_url = "https://api.openalex.org/authors"
    author_filter = f"id:{'|'.join(author_ids)}"
    filters = f"?filter={author_filter}"
    return base_url + filters

#Page limiting and cursor not relevant since we only ever fetch 25 authors.
def fetch_author_details(author_ids, max_results=200):
    """Fetches details of authors using OpenAlex API."""
    all_authors = []
    
    url = URL_filter_authors(author_ids)
    params = {'per_page': max_results, 'mailto': EMAIL}
    response = requests.get(url, params=params)
    data = response.json()
    
    if 'results' in data and data['results']:
        for author in data['results']:
            #Have to check CC beforehand because the list can be missing which breaks indexing
            institutions = author.get('last_known_institutions', [])
            #Empty list = False in python
            country_code = institutions[0].get('country_code', 'N/A') if institutions else 'N/A'
            author_info = {
                'id': author.get('id', 'N/A'),
                'display_name': author.get('display_name', 'N/A'),
                'works_api_url': author.get('works_api_url', 'N/A'),
                'h_index': author.get('summary_stats', {}).get('h_index', 0),
                'works_count': author.get('works_count', 0),
                'country_code': country_code
            }
            all_authors.append(author_info)
    
    return all_authors

def fetch_and_process_authors(chunk):
    """Fetch and process a batch of authors."""
    return fetch_author_details(chunk)

def Request_batching_authors(author_ids, batch_size=25, n_jobs=9):
    """Batch requests for author details."""
    chunks = [author_ids[i:i + batch_size] for i in range(0, len(author_ids), batch_size)]
    all_authors_data = []
    
    with tqdm(total=len(chunks), desc="Fetching author batches") as pbar:
        for i in range(0, len(chunks), n_jobs):
            batch_chunks = chunks[i:i + n_jobs]
            results = Parallel(n_jobs=n_jobs)(
                delayed(fetch_and_process_authors)(chunk) for chunk in batch_chunks
            )
            for batch in results:
                all_authors_data.extend(batch)
            pbar.update(len(batch_chunks))
            time.sleep(1)
    
    return all_authors_data

# Fetch co-author details excluding IC2S2 authors
unique_coauthor_ids = ls[0:30]
coauthor_data = Request_batching_authors(unique_coauthor_ids)

df_coauthors = pd.DataFrame(coauthor_data)
df_coauthors.to_csv("ic2s2_coauthors.csv", index=False)


Fetching author batches: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]
