In [1]:
from grobid_client.grobid_client import GrobidClient
from lxml import etree
from glob import glob
from re import sub
from pathlib import Path
from json import dump
from datetime import datetime

In [2]:
# Metadata fields kept in every output record, listed in output order.
FIELDS = [
    "title",
    "authors",
    "affiliations",
    "publication_date",
    "publisher",
    "doi",
    "keywords",
    "abstract",
]

# GROBID client config file (name spelled "grobig" on purpose: a later cell
# refers to it under this name).
grobig_config = "./grobid_config.json"

# Input PDFs, the folder GROBID writes its TEI files into, and the glob
# pattern used to pick those TEI files up again.
pdf_dir = "../../data/metadata_extraction_data/demo"
tei_dir = "../../data/metadata_extraction_data/grobid_demo_tei"
xml_files_path = Path(tei_dir).joinpath("*.xml")

# Destination of the extracted JSON metadata; created up front if missing.
metadata_dir = Path("../../data/metadata_extraction_data/grobid_demo_metadata")
metadata_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Namespace prefix map shared by every ElementPath query below.
_TEI_NS = {"tei": "http://www.tei-c.org/ns/1.0"}

# (input format, output format) pairs, tried in order from most to least specific.
_TEI_DATE_FORMATS = (
    ("%Y-%m-%d", "%d-%m-%Y"),
    ("%Y-%m", "%m-%Y"),
    ("%Y", "%Y"),
)


def _normalize_ws(text) -> str:
    """Trim ``text`` and collapse each whitespace run to one space (None -> "")."""
    return sub(r"\s+", " ", (text or "").strip())


def _format_tei_date(raw) -> str:
    """Reformat a TEI ``when`` value day-first; return it unchanged if unrecognised."""
    raw = (raw or "").strip()
    for in_fmt, out_fmt in _TEI_DATE_FORMATS:
        try:
            return datetime.strptime(raw, in_fmt).strftime(out_fmt)
        except ValueError:
            continue
    # Unknown layout (e.g. a full timestamp): keep the raw value instead of
    # letting one odd date abort the extraction of every other field.
    return raw


def parse_grobid_tei(tei_path) -> dict:
    """Extract the metadata fields listed in ``FIELDS`` from a GROBID TEI file.

    Args:
        tei_path: Path (or file-like object) handed to ``etree.parse``.

    Returns:
        A dict with one entry per name in ``FIELDS``, in that order.
        ``authors``, ``affiliations`` and ``keywords`` are de-duplicated lists
        of strings in document order; the other fields are strings. A field
        that is absent from the document is ``""`` (or ``[]`` for the list
        fields). All text is whitespace-normalised. ``publication_date`` is
        rewritten as ``DD-MM-YYYY``, ``MM-YYYY`` or ``YYYY`` depending on the
        precision of the ``when`` attribute, and left as found when it matches
        none of those layouts.
    """
    root = etree.parse(tei_path)

    def first_text(xp):
        # Direct text of the first match only; nested markup is not included.
        return _normalize_ws(root.findtext(xp, namespaces=_TEI_NS))

    def all_texts(xp):
        # Full text of every match; dict.fromkeys drops repeats but keeps order.
        texts = (
            _normalize_ws(" ".join(node.itertext()))
            for node in root.findall(xp, namespaces=_TEI_NS)
        )
        return list(dict.fromkeys(texts))

    # First <date> that actually carries a @when attribute, if any.
    date_node = root.find(".//tei:publicationStmt/tei:date[@when]", namespaces=_TEI_NS)
    abstract_node = root.find(".//tei:abstract", namespaces=_TEI_NS)

    data = {
        "title": first_text(".//tei:titleStmt/tei:title"),
        "authors": all_texts(".//tei:persName"),
        "affiliations": all_texts(".//tei:affiliation"),
        "publication_date": _format_tei_date(date_node.get("when")) if date_node is not None else "",
        "publisher": first_text(".//tei:publicationStmt/tei:publisher"),
        "doi": first_text('.//tei:idno[@type="DOI"]'),
        "keywords": all_texts(".//tei:profileDesc/tei:textClass/tei:keywords/tei:term"),
        # Normalised like every other field so the TEI's line breaks and
        # indentation do not leak into the JSON output.
        "abstract": _normalize_ws(" ".join(abstract_node.itertext())) if abstract_node is not None else "",
    }
    # keep only requested keys, ensure ordering
    return {k: data.get(k, "") for k in FIELDS}

In [4]:
# Build the GROBID client from the JSON config file.
client = GrobidClient(config_path=grobig_config)
# Run the header-only service over the files in pdf_dir; results are written
# to tei_dir, which is where the TEI glob pattern defined earlier looks.
client.process(
    service="processHeaderDocument",
    input_path=pdf_dir,
    output=tei_dir,
    # NOTE(review): presumably turns on GROBID's header consolidation against
    # external bibliographic services - confirm in the GROBID client docs.
    consolidate_header=True,
)

INFO - Loading configuration file from ./grobid_config.json
INFO - Configuration file loaded successfully
2025-09-17 23:12:48,296 - grobid_client.grobid_client - INFO - Logging configured - Level: INFO, Console: True, File: disabled
2025-09-17 23:12:48,511 - grobid_client.grobid_client - INFO - GROBID server http://localhost:8070 is up and running
2025-09-17 23:12:48,513 - grobid_client.grobid_client - INFO - Found 1 file(s) to process
2025-09-17 23:12:50,437 - grobid_client.grobid_client - INFO - Processing completed: 1 out of 1 files processed


In [5]:
# Convert every GROBID TEI file into a JSON metadata record, skipping inputs
# whose JSON already exists from an earlier run.
xml_files = sorted(glob(str(xml_files_path)))  # sorted: deterministic processing order
for tei_file in xml_files:
    # Drop only the *trailing* ".tei" / ".grobid" markers GROBID appends;
    # a blanket str.replace would also eat those substrings inside the
    # document's own name (e.g. "my.teichmuller.tei.xml").
    file_name = Path(tei_file).stem
    while file_name.endswith((".tei", ".grobid")):
        file_name = file_name.rsplit(".", 1)[0]

    metadata_write_path = metadata_dir / f"{file_name}.json"
    if metadata_write_path.is_file():
        print("File exists")
        continue

    print(metadata_write_path)
    metadata = parse_grobid_tei(tei_file)
    # Separate name for the handle so the loop variable is never rebound.
    with open(metadata_write_path, 'w', encoding='utf-8') as out_file:
        dump(metadata, out_file, ensure_ascii=False, indent=4)

File exists


In [6]:
# Sanity check: number of TEI files matched by the glob in the previous cell.
len(xml_files)

1