In [1]:
import requests
import pandas as pd
from typing import Optional
from tqdm import tqdm
import time
import random


def _retry_after_seconds(response, default: int = 5) -> int:
    """Return the back-off (in seconds) requested by a 429 response, or `default`."""
    try:
        return max(0, int(response.headers.get('Retry-After', default)))
    except (TypeError, ValueError):
        # Retry-After may be an HTTP-date rather than a number of seconds;
        # fall back to the default wait instead of aborting the whole fetch.
        return default


def _get_with_retries(url: str, params: dict, headers: dict, max_attempts: int = 3, timeout: int = 30):
    """GET `url`, retrying on HTTP 429 and on request errors; raise once attempts run out."""
    for attempt in range(max_attempts):
        final_attempt = attempt == max_attempts - 1
        try:
            response = requests.get(url, params=params, headers=headers, timeout=timeout)
            if response.status_code == 429 and not final_attempt:
                time.sleep(_retry_after_seconds(response))
                continue
            # A 429 on the final attempt reaches this line and is raised as an
            # HTTP error, so an error body is never parsed as a page of results.
            response.raise_for_status()
            return response
        except requests.RequestException:
            if final_attempt:
                raise
            # Exponential back-off with jitter before the next attempt.
            time.sleep(2 ** attempt + random.uniform(0, 1))
    raise ValueError("max_attempts must be at least 1")


def _display_name(container: dict, key: str) -> str:
    """Return `container[key]['display_name']`, tolerating a missing or null nested object."""
    return (container.get(key) or {}).get('display_name', '')


def _author_topic_records(author: dict) -> list:
    """Flatten one author object into one row per topic (a single row if it has no topics)."""
    # `or {}` / `or []` cover keys that are present but null, which a
    # `.get(key, default)` call would pass through as None.
    stats = author.get('summary_stats') or {}
    author_info = {
        'author_id': author.get('id', ''),
        'author_name': author.get('display_name', ''),
        'orcid': author.get('orcid', ''),
        'works_count': author.get('works_count', 0),
        'cited_by_count': author.get('cited_by_count', 0),
        'h_index': stats.get('h_index', 0),
        'i10_index': stats.get('i10_index', 0),
    }

    topics = author.get('topics') or []
    if not topics:
        # Keep topic-less authors as a single placeholder row.
        return [{
            **author_info,
            'topic_id': None, 'topic_name': None, 'topic_works_count': 0,
            'domain': None, 'field': None, 'subfield': None,
        }]

    return [
        {
            **author_info,
            'topic_id': topic.get('id', ''),
            'topic_name': topic.get('display_name', ''),
            'topic_works_count': topic.get('count', 0),
            'domain': _display_name(topic, 'domain'),
            'field': _display_name(topic, 'field'),
            'subfield': _display_name(topic, 'subfield'),
        }
        for topic in topics
    ]


def get_institution_authors_with_topics(institution_id: str, email: Optional[str] = None) -> pd.DataFrame:
    """
    Fetch all current authors from an institution with their research topics and publication counts.

    Args:
        institution_id: OpenAlex institution ID (e.g., 'I887064364' for University of Amsterdam).
            A full 'https://openalex.org/I...' URL is accepted as well.
        email: Your email for faster API access (highly recommended!)

    Returns:
        DataFrame where each row is one author-topic pair with publication count.
        Only authors currently affiliated with the institution (last_known_institutions) are included.
        Rows are sorted by cited_by_count (descending), author_id, then topic_works_count (descending).
        If any request ultimately fails, the error is printed and an EMPTY DataFrame is
        returned; a partially fetched result is never returned as if it were complete.
    """
    base_url = "https://api.openalex.org/authors"
    headers = {'User-Agent': f'mailto:{email}'} if email else {}

    try:
        # Reduce a full OpenAlex URL to its trailing ID so the prefix is not doubled.
        short_id = institution_id.rstrip('/').rsplit('/', 1)[-1]
        institution_filter = f'last_known_institutions.id:https://openalex.org/{short_id}'

        # Count total authors (only needed to size the progress bar)
        print(f"Counting current authors at institution {institution_id}...")
        response = _get_with_retries(base_url, {'filter': institution_filter, 'per_page': 1}, headers)
        total_authors = response.json()['meta']['count']
        print(f"Found {total_authors} current authors to fetch\n")

        # Fetch all authors with cursor paging
        params = {'filter': institution_filter, 'per_page': 200, 'cursor': '*'}

        all_records = []
        with tqdm(total=total_authors, desc="Fetching authors", unit="authors") as pbar:
            while True:
                data = _get_with_retries(base_url, params, headers).json()

                authors = data.get('results') or []
                for author in authors:
                    all_records.extend(_author_topic_records(author))
                pbar.update(len(authors))

                # Stop when there is no next cursor, or it did not advance.
                next_cursor = (data.get('meta') or {}).get('next_cursor')
                if not next_cursor or next_cursor == params['cursor']:
                    break
                params['cursor'] = next_cursor

        # Create and sort DataFrame
        df = pd.DataFrame(all_records)
        if not df.empty:
            df = df.sort_values(['cited_by_count', 'author_id', 'topic_works_count'],
                                ascending=[False, True, False]).reset_index(drop=True)

        print(f"\n✓ Successfully fetched {df['author_id'].nunique() if not df.empty else 0} unique current authors")
        print(f"✓ Total rows: {len(df)} (one per author-topic pair)")
        return df

    except Exception as e:
        # Deliberate best-effort contract: report the failure and hand back an empty frame.
        print(f"Error fetching data: {e}")
        return pd.DataFrame()

In [2]:
# Pull every author currently affiliated with the University of Amsterdam
# (one row per author-topic pair); the email is sent in the User-Agent header.
df = get_institution_authors_with_topics(
    institution_id='I887064364',
    email='sauromania@gmail.com',
)

# Bare last expression so the notebook renders the resulting frame.
df

Counting current authors at institution I887064364...
Found 37423 current authors to fetch



Fetching authors: 100%|██████████| 37423/37423 [04:51<00:00, 128.44authors/s]



✓ Successfully fetched 37423 unique current authors
✓ Total rows: 363261 (one per author-topic pair)


Unnamed: 0,author_id,author_name,orcid,works_count,cited_by_count,h_index,i10_index,topic_id,topic_name,topic_works_count,domain,field,subfield
0,https://openalex.org/A5061726857,Patrick M. Bossuyt,https://orcid.org/0000-0003-4427-0128,1165,196934,153,700,https://openalex.org/T10206,Meta-analysis and systematic reviews,246,Social Sciences,Decision Sciences,"Statistics, Probability and Uncertainty"
1,https://openalex.org/A5061726857,Patrick M. Bossuyt,https://orcid.org/0000-0003-4427-0128,1165,196934,153,700,https://openalex.org/T10804,"Health Systems, Economic Evaluations, Quality ...",143,Social Sciences,"Economics, Econometrics and Finance",Economics and Econometrics
2,https://openalex.org/A5061726857,Patrick M. Bossuyt,https://orcid.org/0000-0003-4427-0128,1165,196934,153,700,https://openalex.org/T10552,Colorectal Cancer Screening and Detection,123,Health Sciences,Medicine,Oncology
3,https://openalex.org/A5061726857,Patrick M. Bossuyt,https://orcid.org/0000-0003-4427-0128,1165,196934,153,700,https://openalex.org/T11732,Assisted Reproductive Technology and Twin Preg...,83,Health Sciences,Medicine,"Pediatrics, Perinatology and Child Health"
4,https://openalex.org/A5061726857,Patrick M. Bossuyt,https://orcid.org/0000-0003-4427-0128,1165,196934,153,700,https://openalex.org/T10696,Gastric Cancer Management and Outcomes,75,Health Sciences,Medicine,Pulmonary and Respiratory Medicine
...,...,...,...,...,...,...,...,...,...,...,...,...,...
363256,https://openalex.org/A5119429136,Paweł Banaś,,0,0,0,0,,,0,,,
363257,https://openalex.org/A5119435485,Daan Vos de Wael,https://orcid.org/0009-0006-4859-0945,0,0,0,0,,,0,,,
363258,https://openalex.org/A5119465487,Kim B. Meekel,,0,0,0,0,,,0,,,
363259,https://openalex.org/A5119474546,Imme Garrelfs Garrelfs,,0,0,0,0,,,0,,,


In [12]:
# Summarise the fetched frame: per-column dtype and non-null count, plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 363261 entries, 0 to 363260
Data columns (total 13 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   author_id          363261 non-null  object
 1   author_name        363261 non-null  object
 2   orcid              125667 non-null  object
 3   works_count        363261 non-null  int64 
 4   cited_by_count     363261 non-null  int64 
 5   h_index            363261 non-null  int64 
 6   i10_index          363261 non-null  int64 
 7   topic_id           362485 non-null  object
 8   topic_name         362485 non-null  object
 9   topic_works_count  363261 non-null  int64 
 10  domain             362485 non-null  object
 11  field              362485 non-null  object
 12  subfield           362485 non-null  object
dtypes: int64(5), object(8)
memory usage: 36.0+ MB


In [14]:
import csv
from pathlib import Path

# Column groups that receive a uniform dtype before export.
int_cols = ['works_count', 'cited_by_count', 'h_index', 'i10_index', 'topic_works_count']
str_cols = ['author_id', 'author_name', 'orcid', 'topic_id', 'topic_name', 'domain', 'field', 'subfield']

# Build the converted columns first and attach them with `assign`, which returns
# a new frame: the frame displayed by earlier cells is never modified in place.
converted = {
    col: pd.to_numeric(df[col], errors='coerce').astype('Int64')  # keeps NA as <NA>
    for col in int_cols if col in df.columns
}
converted.update({
    col: df[col].astype('string').str.strip()
    for col in str_cols if col in df.columns
})
df = df.assign(**converted)

# Stable column order, spelled out explicitly (columns that are absent are skipped).
column_order = [
    'author_id', 'author_name', 'orcid',
    'works_count', 'cited_by_count', 'h_index', 'i10_index',
    'topic_id', 'topic_name', 'topic_works_count',
    'domain', 'field', 'subfield',
]
ordered = [c for c in column_order if c in df.columns]
if ordered:
    df = df.loc[:, ordered]

# Write CSV with safe options
output_path = 'data/uva_authors_topics.csv'
# to_csv does not create missing directories; make sure the target folder exists
# so a long fetch is not lost to an OSError at the very last step.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(
    output_path,
    index=False,
    encoding='utf-8',          # explicit encoding
    quoting=csv.QUOTE_MINIMAL, # quote only when needed
    lineterminator='\n'       # consistent newlines across platforms
)

print(f"Saved {len(df):,} rows to {output_path}")

Saved 363,261 rows to data/uva_authors_topics.csv
