# Code to extract ORCID publications

In [1]:
# Requirements
!pip install python-docx pandas requests

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd
from datetime import datetime
from docx import Document
import requests

In [3]:
# Reporting window: 1 April of start_year through 31 March of end_year
start_year, end_year = 2024, 2025
start_date = datetime(year=start_year, month=4, day=1)
end_date = datetime(year=end_year, month=3, day=31)

# Researcher table is read from the "Researcher Data" sheet of the Excel workbook
file_path = "ceda_orcid_combined_summary.xlsx"
orcid_df = pd.read_excel(
    file_path,
    sheet_name="Researcher Data",
    engine="openpyxl",
)

In [4]:
def get_common_name(x):
    """Convert a "SURNAME, FIRSTNAME" string into "Firstname Surname".

    Parameters
    ----------
    x : str
        Name as stored in the spreadsheet, e.g. "ALEXANDER, JESSE".

    Returns
    -------
    str or None
        Title-cased name with the text after the last comma first, followed
        by the text before the first comma. A name without a comma is
        returned title-cased as-is. Non-string input (e.g. a NaN from an
        empty spreadsheet cell) yields None so it can be dropped later.
    """
    if not isinstance(x, str):
        return None

    parts = [part.strip() for part in x.split(",")]
    # Without a comma, parts[-1] and parts[0] are the same element; using both
    # would duplicate the name ("MADONNA" -> "Madonna Madonna").
    ordered = parts if len(parts) == 1 else [parts[-1], parts[0]]

    # Skip empty pieces (e.g. "SMITH,") so no stray spaces end up in the name.
    return " ".join(piece for piece in ordered if piece).title()

# Derive a "Firstname Surname" column from the spreadsheet's "SURNAME, FIRSTNAME" values
orcid_df["common_name"] = orcid_df["Name"].apply(get_common_name)

In [5]:
# Preview the first rows to check the derived common_name column
orcid_df.head()

Unnamed: 0,Name,ORCID,Number of Works for 24/25,Type of Works for 24/25,common_name
0,"ALEXANDER, JESSE",0009-0006-2877-3197,4,preprint: 1\njournal-article: 1\nconference-po...,Jesse Alexander
1,"ANDERSON, EMILY",0009-0005-7426-840X,0,None specified,Emily Anderson
2,"CONWAY, ESTHER",0000-0002-7796-7661,6,report: 5\njournal-article: 1,Esther Conway
3,"DONEGAN, STEPHEN",0000-0002-4609-9427,0,None specified,Stephen Donegan
4,"EVANS, RHYS",0009-0006-3575-578X,4,conference-presentation: 3\nconference-poster: 1,Rhys Evans


In [6]:
def get_json(url, timeout=30):
    """Download ``url`` requesting JSON and return the decoded content.

    Parameters
    ----------
    url : str
        Address to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Defaults to 30.

    Returns
    -------
    dict
        The decoded JSON body, or an empty dict when the response status is
        not 200, so callers can safely use ``.get()`` on the result.

    Notes
    -----
    Connection errors and timeouts raised by ``requests`` are not caught here.
    """
    headers = {"Accept": "application/json"}
    # requests has no default timeout; without one a stalled connection would
    # block the whole run indefinitely.
    response = requests.get(url, headers=headers, timeout=timeout)

    if response.status_code != 200:
        return {}

    return response.json()

In [7]:
# Functions to query the ORCID API and extract relevant works
def get_contribs(work_path):
    """Return all contributors of a specific work as "Surname, Forenames".

    Parameters
    ----------
    work_path : str or None
        Path of the work relative to the ORCID public API root, as found in
        the "path" field of a work summary.

    Returns
    -------
    list of str
        One entry per contributor that has a credit name. The last
        whitespace-separated token of the credit name is treated as the
        surname; single-token names are returned unchanged. An empty list is
        returned when the path is missing, the download yields no data, or
        the work lists no contributors.
    """
    if not work_path:
        return []

    url = "https://pub.orcid.org/v3.0" + work_path
    data = get_json(url)

    # Use "or" fallbacks rather than .get(key, default): the response may be
    # an empty dict (failed download) or hold explicit nulls for these fields.
    contributors = (data.get("contributors") or {}).get("contributor") or []

    names = []
    for contrib in contributors:
        credit_name = ((contrib or {}).get("credit-name") or {}).get("value") or ""
        tokens = credit_name.split()
        if not tokens:
            # Contributor without a usable credit name: nothing to cite.
            continue
        if len(tokens) == 1:
            names.append(tokens[0])
        else:
            names.append(f"{tokens[-1]}, {' '.join(tokens[:-1])}")

    return names


def _parse_pub_date(pub_date):
    "Convert an ORCID publication-date mapping to a datetime, or None if unusable."
    def _part(key, default):
        # Each date part is either null or a mapping holding a "value" entry.
        node = (pub_date or {}).get(key) or {}
        return node.get("value") or default

    year = _part("year", None)
    if not year:
        return None

    try:
        # A missing month or day defaults to 1.
        return datetime(int(year), int(_part("month", 1)), int(_part("day", 1)))
    except (TypeError, ValueError):
        # Non-numeric or impossible dates (e.g. 31 June) cannot be placed in
        # the reporting window, so the work is skipped rather than crashing.
        return None


def get_orcid_works(person):
    """Return citation strings for a researcher's works in the date range.

    Parameters
    ----------
    person : str
        Value of the ``common_name`` column identifying the researcher in
        ``orcid_df``.

    Returns
    -------
    list of str
        One citation per work summary whose publication date lies between
        ``start_date`` and ``end_date`` (both inclusive), formatted as
        "<authors> (<year>). <title>. DOI: <doi>. [<type>]". Works without a
        usable publication date are skipped.

    Raises
    ------
    IndexError
        If ``person`` does not match any row of ``orcid_df``.
    """
    orcid_id = orcid_df.loc[orcid_df["common_name"] == person, "ORCID"].iat[0]
    if pd.isna(orcid_id):
        # No ORCID iD recorded for this researcher: nothing to query.
        return []

    url = f"https://pub.orcid.org/v3.0/{orcid_id}/works"
    data = get_json(url)
    works = []

    # "or" fallbacks are used throughout because the JSON may contain explicit
    # nulls, which dict.get(key, default) does not replace.
    for group in data.get("group") or []:
        for work_summary in group.get("work-summary") or []:
            pub_datetime = _parse_pub_date(work_summary.get("publication-date"))
            if pub_datetime is None or not (start_date <= pub_datetime <= end_date):
                continue

            title_node = (work_summary.get("title") or {}).get("title") or {}
            raw_title = title_node.get("value") or ""
            # "&#160;" is a non-breaking space: turn it into a normal space
            # (deleting it would glue words together), then collapse runs of
            # whitespace and trim the ends.
            title = " ".join(raw_title.replace("&#160;", " ").split())

            type_ = work_summary.get("type") or ""
            external_ids = (work_summary.get("external-ids") or {}).get("external-id") or []
            doi = next(
                (
                    eid.get("external-id-value")
                    for eid in external_ids
                    if eid.get("external-id-type") == "doi"
                ),
                "",
            ) or ""

            work_path = work_summary.get("path")
            # Contributors can only be looked up when the summary has a path.
            authors = get_contribs(work_path) if work_path else []
            citation = f"{'; '.join(authors)} ({pub_datetime.year}). {title}. DOI: {doi}. [{type_}]"
            works.append(citation)

    return works

In [8]:
def get_all_citations():
    """Gather the citations of every named researcher into one sorted list.

    Iterates over the non-missing ``common_name`` values of ``orcid_df``,
    prints a progress line for each, and returns the combined citations in
    ascending string order.
    """
    collected = []
    researchers = orcid_df["common_name"].dropna()

    for researcher in researchers:
        print(f"[INFO] Getting citations for: {researcher}")
        collected += get_orcid_works(researcher)

    # Alphabetical order of the citation strings
    collected.sort()
    return collected

In [10]:
# Query ORCID for every researcher in orcid_df (one progress line is printed per person)
citations = get_all_citations()

[INFO] Getting citations for: Jesse Alexander
[INFO] Getting citations for: Emily Anderson
[INFO] Getting citations for: Esther Conway
[INFO] Getting citations for: Stephen Donegan
[INFO] Getting citations for: Rhys Evans
[INFO] Getting citations for: Nicola Farmer
[INFO] Getting citations for: Ellie Fisher
[INFO] Getting citations for: Wendy Elizabeth Garland
[INFO] Getting citations for: Hayley Gray
[INFO] Getting citations for: Matthew Jones
[INFO] Getting citations for: Martin Juckes
[INFO] Getting citations for: Philip Kershaw
[INFO] Getting citations for: Diane Knappett
[INFO] Getting citations for: Danny Lloyd
[INFO] Getting citations for: Molly Macrae
[INFO] Getting citations for: Ag Stephens
[INFO] Getting citations for: Neil Massey
[INFO] Getting citations for: Chaminuka Mbanje
[INFO] Getting citations for: Matthew Paice
[INFO] Getting citations for: Alison Pamment
[INFO] Getting citations for: Graham Parton
[INFO] Getting citations for: Charlotte Pascoe
[INFO] Getting citati

In [11]:
def create_word_doc(citations):
    """Save the citations as a Word (.docx) file, one paragraph per citation.

    The document gets a level-1 heading naming the reporting years, and the
    file name is built from ``start_year`` and ``end_year``. A confirmation
    line with the file name is printed once the document is saved.
    """
    output_name = f"CEDA_Publications_{start_year}_{end_year}.docx"

    report = Document()
    report.add_heading(f"CEDA Staff Publications ({start_year}-{end_year})", level=1)

    # Each citation becomes its own paragraph, in the order given
    for entry in citations:
        report.add_paragraph(entry)

    report.save(output_name)

    print(f"Document saved as '{output_name}'.")

In [13]:
# Write the collected citations to the .docx file (saved under a relative file name)
create_word_doc(citations)

Document saved as 'CEDA_Publications_2024_2025.docx'.


In [None]:
# Ideas for future improvements: 
# - CEDA authors to be in bold
# - summary table with the total number of works of each type
# - summary text which says all CEDA authors are marked in bold
# - make it generic (i.e. spreadsheet name) so others can use it e.g. RAL Space, EDS 
# - remove duplicates, use the "all versions" DOI 
# - add URL link rather than just the plain text DOI 