In [1]:
from grobid_client.grobid_client import GrobidClient
from lxml import etree
from glob import glob
from re import sub
from pathlib import Path
from json import dump

In [2]:
# Metadata keys emitted for every document, in output order.
FIELDS = [
    "title",
    "authors",
    "affiliations",
    "publication_date",
    "publisher",
    "doi",
    "keywords",
    "abstract",
]

# Settings file handed to the GROBID client.
grobig_config = "./grobid_config.json"

# Input PDFs and the directory that receives the TEI XML output.
pdf_dir = "../data/metadata_extraction_data/pdf"
tei_dir = "../data/metadata_extraction_data/grobid_tei"
# Glob pattern matching every XML file directly inside tei_dir.
xml_files_path = Path(tei_dir).joinpath("*.xml")

# Destination of the per-document JSON files; created up front if missing.
metadata_dir = Path("../data/metadata_extraction_data/grobid_metadata")
metadata_dir.mkdir(parents=True, exist_ok=True)

In [None]:
def parse_grobid_tei(tei_path) -> dict:
    """Extract bibliographic metadata from a GROBID TEI XML document.

    Args:
        tei_path: Source handed directly to ``etree.parse`` (for example a
            file path or an open file object).

    Returns:
        A dict with one entry per name in the module-level ``FIELDS`` list,
        in that order. ``authors``, ``affiliations`` and ``keywords`` are
        lists of unique, non-empty strings in document order; the remaining
        values are strings, ``""`` when the element is absent. Runs of
        whitespace are collapsed to a single space in every value. A name in
        ``FIELDS`` that this function does not extract maps to ``""``.

    Raises:
        Whatever ``etree.parse`` raises for an unreadable or malformed source.
    """
    ns = {"tei": "http://www.tei-c.org/ns/1.0"}
    root = etree.parse(tei_path)

    # helpers ---------------------------------------------------------------
    def clean(text):
        # Pretty-printed TEI leaves newlines/indentation inside values.
        return sub(r"\s+", " ", (text or "").strip())

    def full_text(node):
        # Text of the node and all of its descendants, space separated.
        return clean(" ".join(node.itertext()))

    def tex(xp):
        # Direct text of the first match, "" when nothing matches.
        return clean(root.findtext(xp, namespaces=ns))

    def many(xp):
        # Unique, non-empty texts of all matches, first occurrence wins.
        texts = (full_text(node) for node in root.findall(xp, namespaces=ns))
        return list(dict.fromkeys(t for t in texts if t))

    # A TEI <date> may carry its value only in the ``when`` attribute; fall
    # back to it when the element has no text content.
    date_xp = ".//tei:publicationStmt/tei:date"
    publication_date = tex(date_xp)
    if not publication_date:
        date_node = root.find(date_xp, namespaces=ns)
        if date_node is not None:
            publication_date = clean(date_node.get("when"))

    # Looked up once; normalised like every other field.
    abstract_node = root.find(".//tei:abstract", namespaces=ns)
    abstract = full_text(abstract_node) if abstract_node is not None else ""

    data = {
        "title": tex(".//tei:titleStmt/tei:title"),
        # NOTE(review): matches every persName in the document, not only
        # those nested under tei:author -- confirm that is intended.
        "authors": many(".//tei:persName"),
        "affiliations": many(".//tei:affiliation"),
        "publication_date": publication_date,
        "publisher": tex(".//tei:publicationStmt/tei:publisher"),
        "doi": tex('.//tei:idno[@type="DOI"]'),
        "keywords": many(".//tei:profileDesc/tei:textClass/tei:keywords/tei:term"),
        "abstract": abstract,
    }
    # keep only requested keys, ensure ordering
    return {k: data.get(k, "") for k in FIELDS}

In [4]:
# Run GROBID header extraction over every PDF in pdf_dir and store the
# resulting TEI documents in tei_dir.
header_job = {
    "service": "processHeaderDocument",
    "input_path": pdf_dir,
    "output": tei_dir,
    "consolidate_header": True,
}
client = GrobidClient(config_path=grobig_config)
client.process(**header_job)

GROBID server is up and running


In [None]:
# Turn every TEI file produced by GROBID into a JSON metadata file.
# glob() returns matches in arbitrary order; sort for reproducible runs.
xml_files = sorted(glob(str(xml_files_path)))
for tei_file in xml_files:
    metadata = parse_grobid_tei(tei_file)

    # Path.stem only drops the final ".xml". Strip a trailing ".tei" as well,
    # but only at the end so a ".tei" elsewhere in the name is preserved.
    file_name = Path(tei_file).stem
    if file_name.endswith(".tei"):
        file_name = file_name[: -len(".tei")]
    metadata_write_path = metadata_dir / f"{file_name}.json"
    # Distinct handle name: the loop variable is no longer shadowed.
    with open(metadata_write_path, "w", encoding="utf-8") as json_file:
        dump(metadata, json_file, ensure_ascii=False, indent=4)