In [None]:
from bs4 import BeautifulSoup 
import pandas as pd
import numpy as np
import requests
import time
from joblib import Parallel, delayed
from tqdm import tqdm
import re

# Retrieving data:

In [None]:
# OpenAlex authors endpoint queried by get_author_data.
BASE_URL = "https://api.openalex.org/authors"
# Contact address sent as the `mailto` query parameter of each author search.
email = "s204120@dtu.dk"

#Only used for Week 2 name searching so code is relatively slow on purpose to avoid API errors
def get_author_data(name):
    """Search OpenAlex for an author by name and summarise the first hit.

    Sends one GET request to ``BASE_URL`` with ``name`` as the ``search``
    parameter and keeps only the first entry of the returned ``results``.

    Args:
        name: Author name to search for.

    Returns:
        A dict with the keys ``id``, ``display_name``, ``works_api_url``,
        ``h_index``, ``works_count`` and ``country_code``, or ``None`` when
        the request fails, the body is not valid JSON, or the search
        returns no results.
    """
    # Keep the try body limited to the network call and decoding so that
    # only transport/decoding problems are reported as fetch errors.
    try:
        response = requests.get(BASE_URL, params={'search': name, 'mailto': email}, timeout=10)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError: a body that is not valid JSON (older requests versions
        # raise it directly instead of a RequestException subclass).
        print(f"Error fetching data for {name}: {e}")
        return None

    results = data.get('results') if isinstance(data, dict) else None
    if not results:
        return None

    author = results[0]  # Take the first result

    # The API may send explicit nulls; `.get(key, default)` only covers a
    # missing key, so fall back with `or` as well.
    institutions = author.get('last_known_institutions') or []
    if institutions:
        country_code = institutions[0].get('country_code') or 'N/A'
    else:
        country_code = 'N/A'
    summary_stats = author.get('summary_stats') or {}

    return {
        'id': author.get('id', 'N/A'),
        'display_name': author.get('display_name', 'N/A'),
        'works_api_url': author.get('works_api_url', 'N/A'),
        'h_index': summary_stats.get('h_index', 0),
        'works_count': author.get('works_count', 0),
        'country_code': country_code,
    }
#10 Authors and parallel jobs (theoretical max: 25 author, 10 jobs /s)
def process_in_batches(author_names, batch_size=10, delay=1.0):
    """Fetch author records for a list of names, one parallel batch at a time.

    Args:
        author_names: Sequence of author names to look up.
        batch_size: Number of names per batch; also the number of parallel
            jobs used for that batch.
        delay: Seconds to sleep after every batch to throttle the request rate.

    Returns:
        List of the truthy results of get_author_data, in batch order
        (names whose lookup returned None are left out).
    """
    collected = []
    batch_starts = range(0, len(author_names), batch_size)

    for start in tqdm(batch_starts, desc="Fetching author data"):
        names = author_names[start:start + batch_size]
        fetched = Parallel(n_jobs=batch_size)(
            delayed(get_author_data)(author_name) for author_name in names
        )
        # Drop failed / empty lookups.
        for record in fetched:
            if record:
                collected.append(record)
        # Pause between batches so the request rate stays bounded.
        time.sleep(delay)

    return collected
    
# Fetch an OpenAlex author record for every name in author_names_2024,
# 10 names per parallel batch with a 1 s pause after each batch.
# NOTE(review): author_names_2024 is not defined in this part of the file —
# presumably built in an earlier cell; confirm before running.
authors_data = process_in_batches(author_names_2024, batch_size=10, delay=1.0)

# Saving the output and filtering the data:

In [None]:
# Tabulate the fetched author records (one row per author).
df = pd.DataFrame(authors_data)
# Keep only authors whose works_count lies in the inclusive range [5, 5000],
# then store the filtered table for the later ID extraction.
df_filtered = df[df['works_count'].between(5, 5000)]
df_filtered.to_csv('author_names_plain.csv', index=False)

In [None]:
# Build the list of bare author IDs used for the IC2S2 works API calls.
# The `id` column holds full OpenAlex URLs; keep only the part after the last
# '/' instead of slicing a fixed number of characters, so IDs of any length
# survive intact. Rows without an id (an 'N/A' value is read back as NaN by
# read_csv) are skipped rather than crashing on a float.
IC2S2_author_IDs = [
    author_url.rsplit('/', 1)[-1]
    for author_url in pd.read_csv("author_names_plain.csv")["id"].dropna()
]

# Fetching author works from author ID's:

In [None]:
# OpenAlex concept IDs used to narrow the works query.
# The full list of concept IDs is served at: https://api.openalex.org/concepts
# NOTE(review): concepts1 presumably holds the social-science concepts and
# concepts2 the quantitative ones — confirm against the concepts endpoint.
concepts1 = [
    'C144024400',
    'C15744967',
    'C162324750',
    'C17744445',
]
concepts2 = [
    'C33923547',
    'C121332964',
    'C41008148',
]
# Contact address sent as the `mailto` parameter of the works requests.
EMAIL = "s204120@dtu.dk"


def URL_filter(author_ids):
    """Build the OpenAlex works URL (including its filter) for some authors.

    The filter combines four comma-separated clauses: any of the given
    authors, more than 10 citations, any concept from ``concepts1`` and any
    concept from ``concepts2``. Alternatives inside a clause are joined
    with '|'.

    Args:
        author_ids: Iterable of author ID strings.

    Returns:
        The works endpoint URL with the ``?filter=`` query string appended.
    """
    any_author = '|'.join(author_ids)
    any_concept1 = '|'.join(concepts1)
    any_concept2 = '|'.join(concepts2)

    clauses = [
        f"author.id:{any_author}",
        "cited_by_count:>10",
        f"concepts.id:({any_concept1})",
        f"concepts.id:({any_concept2})",
    ]

    return "https://api.openalex.org/works" + "?filter=" + ",".join(clauses)

# Function to fetch works for a given batch of authors
def fetch_author_works(author_ids, max_results=200, cursor="*"):
    """Download all works matching URL_filter for a group of authors.

    Pages through the results with the API's cursor mechanism: the
    ``meta.next_cursor`` of each response is sent as the ``cursor`` of the
    next request, and paging stops once no cursor is returned.

    Args:
        author_ids: Author IDs passed on to URL_filter.
        max_results: Value sent as the ``per_page`` parameter.
        cursor: Cursor to start from; "*" is the initial value.

    Returns:
        List of the raw work dicts gathered from every page's ``results``.
        If a request fails (network error, timeout, HTTP error status or a
        non-JSON body) the error is printed and the works collected up to
        that point are returned.
    """
    # The URL depends only on the author IDs, so build it once.
    url = URL_filter(author_ids)
    all_works = []

    while cursor:
        params = {
            'per_page': max_results,
            'cursor': cursor,
            'mailto': EMAIL,
        }

        try:
            # A timeout keeps a stalled connection from blocking the worker
            # forever; raise_for_status surfaces HTTP errors (e.g. rate
            # limiting) that would otherwise just look like "no more pages".
            response = requests.get(url, params=params, timeout=30)
            response.raise_for_status()
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"Error fetching works for {len(author_ids)} authors: {e}")
            break

        all_works.extend(data.get('results') or [])
        # `or {}` also covers an explicit null `meta`.
        cursor = (data.get('meta') or {}).get('next_cursor')

    return all_works

# Function to extract relevant details from works
def extract_work_details(works):
    """Reduce raw work dicts to the fields kept in the datasets.

    Args:
        works: Iterable of work dicts as returned by the works endpoint.

    Returns:
        List with one dict per work holding ``id``, ``publication_year``,
        ``cited_by_count``, ``author_ids`` (list of the author IDs found in
        ``authorships``), ``title`` and ``abstract_inverted_index``.
        Missing keys fall back to 'N/A' (0 for ``cited_by_count``); values
        that are present but null are passed through unchanged.
    """
    works_data = []
    for work in works:
        # Tolerate a missing/null `authorships` list and authorship entries
        # without an `author` dict instead of raising.
        authorships = work.get('authorships') or []
        author_ids = [
            (authorship.get('author') or {}).get('id', 'N/A')
            for authorship in authorships
        ]
        works_data.append({
            'id': work.get('id', 'N/A'),
            'publication_year': work.get('publication_year', 'N/A'),
            'cited_by_count': work.get('cited_by_count', 0),
            'author_ids': author_ids,
            'title': work.get('title', 'N/A'),
            'abstract_inverted_index': work.get('abstract_inverted_index', 'N/A'),
        })
    return works_data


def fetch_and_process_batch(chunk):
    """Fetch the works for one chunk of author IDs and flatten them.

    Single-argument wrapper so one chunk can be handed to a parallel job.

    Args:
        chunk: List of author IDs that share one API query.

    Returns:
        The list produced by extract_work_details for the fetched works.
    """
    return extract_work_details(fetch_author_works(chunk))

#Batch size = how many authors per API request, n_jobs are parallel threads.
def Request_batching(author_ids, batch_size=25, n_jobs=9):
    """Fetch and flatten the works of all authors, several chunks in parallel.

    The IDs are cut into chunks of ``batch_size`` authors (one API query per
    chunk); the chunks are then processed ``n_jobs`` at a time, with a
    one-second pause after every parallel group.

    Args:
        author_ids: Sequence of author ID strings.
        batch_size: Number of authors combined into one API query.
        n_jobs: Number of chunks processed concurrently.

    Returns:
        List of work records (see extract_work_details), concatenated in
        chunk order.
    """
    chunks = [
        author_ids[start:start + batch_size]
        for start in range(0, len(author_ids), batch_size)
    ]

    collected = []
    with tqdm(total=len(chunks), desc="Fetching batches") as progress:
        for first in range(0, len(chunks), n_jobs):
            # Take up to n_jobs chunks and run them side by side.
            group = chunks[first:first + n_jobs]
            jobs = (delayed(fetch_and_process_batch)(chunk) for chunk in group)
            # NOTE(review): results are concatenated without de-duplication;
            # presumably a work whose authors sit in different chunks shows
            # up once per chunk — confirm whether that is intended.
            for records in Parallel(n_jobs=n_jobs)(jobs):
                collected.extend(records)

            progress.update(len(group))
            # Pause between groups to stay clear of rate limits.
            time.sleep(1)

    return collected
    
# Entry point: fetch the filtered works for every author ID in
# IC2S2_author_IDs (default settings: 25 authors per request, 9 parallel jobs).
all_works_data = Request_batching(IC2S2_author_IDs)

# Tabulate all fetched work records.
# Note: this rebinds `df`, which earlier held the author table.
df = pd.DataFrame(all_works_data)

# Separating the data into the two datasets:

In [None]:
# Papers dataset: identifiers, year, citation count and author IDs per work.
df_papers = pd.DataFrame(
    all_works_data,
    columns=['id', 'publication_year', 'cited_by_count', 'author_ids'],
)
df_papers.to_csv("IC2S2_Papers.csv", index=False)

# Abstracts dataset: title and abstract inverted index per work.
df_abstracts = pd.DataFrame(
    all_works_data,
    columns=['id', 'title', 'abstract_inverted_index'],
)
df_abstracts.to_csv("IC2S2_Abstracts.csv", index=False)

The dataset comprises 13806 works and 21160 unique authors.

To speed up the code, several techniques were employed. We made sure to have the maximum allowed 25 authors in each API call by joining their IDs with the '|' (OR) notation in the URL filter. This was achieved by creating a batching function that splits the list of author IDs into chunks of 25.

We used joblib to parallelize the API calls, running 9 calls at a time with 25 authors each. We erred on the side of caution here to avoid API errors, and also added a one-second sleep between groups of parallel calls to avoid rate limiting.

Another efficiency gain comes from including the concept filtering and the inequalities directly in the filter of the API call, so that only relevant works are returned.

Coarse concept definitions give us a large range of potential works that is still focused. This is probably the variable that would change the potential scope the most.

The works-count range [5, 5000] makes sure we don't get authors with very few works, or ones where SoSci themes aren't their main research subject. In other words, it gives us more relevant authors. The 5000 cap makes sure we don't balloon the dataset if there are anomalies in the API with extreme amounts of attached works.

Since the concepts are so coarse though, we could risk getting a dataset that is very broad in nature. Concepts such as math and computer science will most likely be present in a very large majority of technical works. 

Authors per work will also likely focus the dataset away from works with very broad scope.