### Instructions for Running Grobid Docker

Before running this script, make sure you have Grobid running as a Docker container.

You can start Grobid using the following command:

```docker run --rm -p 8070:8070 lfoppiano/grobid:0.8.0```

This exposes the Grobid REST API at `http://localhost:8070`.

In [None]:
import os
import fitz  # PyMuPDF
from pdfminer.high_level import extract_text
import requests
from bs4 import BeautifulSoup
from lxml import etree
import re

In [36]:
# ==== Paths & Helpers ====
def get_file_path(filename, folder):
    """Return the full path to filename in a folder."""
    return os.path.join(os.getcwd(), folder, filename)

def ensure_dir_exists(folder):
    if not os.path.exists(folder):
        os.makedirs(folder)

In [38]:
# ==== File Structure ====
PDF_FILENAME = "2021Bouza.pdf"
RAW_DIR = "data/raw"
PROCESSED_DIR = "data/processed"

ensure_dir_exists(PROCESSED_DIR)

# Input
pdf_path = get_file_path(PDF_FILENAME, RAW_DIR)
out_base = os.path.splitext(PDF_FILENAME)[0]
# Outputs
pymupdf_output = get_file_path(f"{out_base}_pymupdf.txt", PROCESSED_DIR)
pdfminer_output = get_file_path(f"{out_base}_pdfminer.txt", PROCESSED_DIR)
tei_output = get_file_path(f"{out_base}_tei.xml", PROCESSED_DIR)

In [39]:
# ==== PyMuPDF Extraction ====
def extract_with_pymupdf(pdf_path, output_path):
    doc = fitz.open(pdf_path)
    with open(output_path, "w", encoding="utf-8") as out:
        for i, page in enumerate(doc, start=1):
            out.write(f"\n\n--- Page {i} ---\n\n")
            out.write(page.get_text())
    print(f"[PyMuPDF] Text extracted to {output_path}")

extract_with_pymupdf(pdf_path, pymupdf_output)

[PyMuPDF] Text extracted to /Users/jamesbyers/code/github/knowledge_graphs/data/processed/2021Bouza_pymupdf.txt


In [40]:
# ==== PDFMiner Extraction ====
def extract_with_pdfminer(pdf_path, output_path):
    text = extract_text(pdf_path)
    with open(output_path, "w", encoding="utf-8") as out:
        out.write(text)
    print(f"[PDFMiner] Text extracted to {output_path}")

extract_with_pdfminer(pdf_path, pdfminer_output)

Cannot set gray stroke color because /'P12' is an invalid float value
Cannot set gray non-stroke color because /'P12' is an invalid float value


[PDFMiner] Text extracted to /Users/jamesbyers/code/github/knowledge_graphs/data/processed/2021Bouza_pdfminer.txt


In [41]:
# ==== Grobid Extraction ====
def parse_pdf_with_grobid(pdf_path, output_path, service_url="http://localhost:8070/api/processFulltextDocument"):
    with open(pdf_path, 'rb') as pdf_file:
        files = {'input': (pdf_path, pdf_file, 'application/pdf')}
        response = requests.post(service_url, files=files)
        if response.status_code == 200:
            with open(output_path, "w", encoding="utf-8") as out:
                out.write(response.text)
            print(f"[Grobid] TEI XML written to {output_path}")
            return output_path
        else:
            raise Exception(f"Grobid error: {response.status_code} {response.text}")

parse_pdf_with_grobid(pdf_path, tei_output)


[Grobid] TEI XML written to /Users/jamesbyers/code/github/knowledge_graphs/data/processed/2021Bouza_tei.xml


'/Users/jamesbyers/code/github/knowledge_graphs/data/processed/2021Bouza_tei.xml'

In [42]:
# ==== TEI Parsing with BeautifulSoup & lxml ====
def parse_tei_xml(tei_file):
    with open(tei_file, "r", encoding="utf-8") as f:
        tei_text = f.read()
    soup = BeautifulSoup(tei_text, "lxml-xml")
    tree = etree.fromstring(tei_text.encode('utf-8'))
    NS = {"tei": "http://www.tei-c.org/ns/1.0"}

    # Title
    title = soup.find('titleStmt').find('title').text.strip() if soup.find('titleStmt') else ""
    # Authors
    authors = []
    for author in tree.findall('.//tei:titleStmt/tei:author', namespaces=NS):
        pers = author.find('tei:persName', namespaces=NS)
        if pers is not None:
            surname = pers.find('tei:surname', namespaces=NS)
            forename = pers.find('tei:forename', namespaces=NS)
            full_name = " ".join([
                forename.text.strip() if forename is not None else "",
                surname.text.strip() if surname is not None else ""
            ]).strip()
            if full_name:
                authors.append(full_name)
    # Abstract
    abstract_el = tree.find('.//tei:abstract', namespaces=NS)
    abstract = " ".join(p.text.strip() for p in abstract_el.findall('.//tei:p', namespaces=NS)) if abstract_el is not None else ""
    # Sections
    sections = []
    for div in tree.findall('.//tei:text/tei:body/tei:div[@type="section"]', namespaces=NS):
        head = div.find('tei:head', namespaces=NS)
        head_text = head.text.strip() if head is not None else ""
        paragraphs = [p.text.strip() for p in div.findall('tei:p', namespaces=NS) if p.text]
        sections.append({'heading': head_text, 'paragraphs': paragraphs})
    return title, authors, abstract, sections

tei_title, tei_authors, tei_abstract, tei_sections = parse_tei_xml(tei_output)

In [43]:
# ==== PyMuPDF Regex Extraction (as fallback or comparison) ====
def extract_sections_from_txt(txt_file):
    with open(txt_file, "r", encoding="utf-8") as f:
        text = f.read()
    # Example regex extraction (adapt as needed):
    abstract = re.search(r"ABSTRACT\s*(.*?)\s*I\. INTRODUCTION", text, re.DOTALL)
    abstract = abstract.group(1).strip() if abstract else ""
    sections = re.findall(r"\n([IVX]+\. [A-Z \-]+)\n(.*?)(?=\n[IVX]+\. |$)", text, re.DOTALL)
    section_data = []
    for heading, content in sections:
        paras = [p.strip() for p in content.strip().split('\n') if p.strip()]
        section_data.append({'heading': heading.strip(), 'paragraphs': paras})
    return abstract, section_data

pymupdf_abstract, pymupdf_sections = extract_sections_from_txt(pymupdf_output)


In [11]:
from lxml import etree

tei_file = "2021Bouza_tei.xml"
tree = etree.parse(tei_file)
root = tree.getroot()

NS = {"tei": "http://www.tei-c.org/ns/1.0"}

# Title
title = root.find('.//tei:titleStmt/tei:title', namespaces=NS)
print("Title:", title.text.strip() if title is not None else "")

# Authors (main article authors only)
authors = []
for author in root.findall('.//tei:titleStmt/tei:author', namespaces=NS):
    pers = author.find('tei:persName', namespaces=NS)
    if pers is not None:
        surname = pers.find('tei:surname', namespaces=NS)
        forename = pers.find('tei:forename', namespaces=NS)
        full_name = " ".join([forename.text.strip() if forename is not None else "", 
                              surname.text.strip() if surname is not None else ""]).strip()
        if full_name:
            authors.append(full_name)
print("Authors:", authors)

# Abstract
abstract = root.find('.//tei:abstract', namespaces=NS)
if abstract is not None:
    abs_text = " ".join(p.text.strip() for p in abstract.findall('.//tei:p', namespaces=NS) if p.text)
    print("Abstract:", abs_text)
else:
    print("Abstract:")

# Sections
print("\nSections:")
for div in root.findall('.//tei:text/tei:body/tei:div[@type="section"]', namespaces=NS):
    head = div.find('tei:head', namespaces=NS)
    head_text = head.text.strip() if head is not None else ""
    paragraphs = [p.text.strip() for p in div.findall('tei:p', namespaces=NS) if p.text]
    print(f"\n== {head_text} ==")
    for para in paragraphs:
        print(para)


Title: The spectrum of a 1-μm-wavelength-driven tin microdroplet laser-produced plasma source in the 5.5-265.5 nm wavelength range
Authors: []
Abstract: Production of 13.5 nm light with 5% conversion efficiency from 2 μ m laser-driven tin microdroplet plasma

Sections:


In [51]:
# ==== Outputs ====

print("\n" + "="*80)
print(f"TEI Title: {tei_title}")
print(f"TEI Authors: {tei_authors}")
print(f"TEI Abstract: {tei_abstract[:300]}...")
print("TEI Sections:")
for section in tei_sections[:3]:  # print first 3 sections for brevity
    print(f"  - {section['heading']}: {section['paragraphs'][0][:80]}..." if section['paragraphs'] else f"  - {section['heading']}: [No content]")
print("="*80)
print(f"\nPyMuPDF Abstract:\n{pymupdf_abstract[:300]}")
print("\nPyMuPDF Sections (first 2):")
for section in pymupdf_sections[:2]:
    print(f"  - {section['heading']}: {section['paragraphs'][0][:80]}..." if section['paragraphs'] else f"  - {section['heading']}: [No content]")
print("="*80)


TEI Title: The spectrum of a 1-μm-wavelength-driven tin microdroplet laser-produced plasma source in the 5.5-265.5 nm wavelength range
TEI Authors: []
TEI Abstract: Production of 13.5 nm light with 5% conversion efficiency from 2 μ m laser-driven tin microdroplet plasma...
TEI Sections:

PyMuPDF Abstract:
We present a calibrated spectrum in the 5.5–265.5 nm range from a microdroplet-tin Nd:YAG-laser-produced plasma under conditions
relevant for the production of extreme ultraviolet (EUV) light at 13.5 nm for nanolithography. The plasma emission spectrum obtained
using a custom-built transmission grat

PyMuPDF Sections (first 2):
  - I. INTRODUCTION: Laser-produced plasma (LPP) generated from liquid tin (Sn)...
  - II. EXPERIMENTAL SETUP: In our experiments, molten tin microdroplets of 46 μm dia-...


In [56]:
tei_sections

[]

In [57]:
import re

with open("/Users/jamesbyers/code/github/knowledge_graphs/2021Bouza_pymupdf.txt", "r") as f:
    text = f.read()

# Title: first non-empty line
title = re.search(r"(?<=\| )(.*?nm wavelength range)", text, re.DOTALL)
if title:
    title = title.group(1).strip()

# Abstract: between 'ABSTRACT' and 'I. INTRODUCTION'
abstract = re.search(r"ABSTRACT\s*(.*?)\s*I\. INTRODUCTION", text, re.DOTALL)
if abstract:
    abstract = abstract.group(1).strip()

# Authors: first block after title, before affiliations
authors_block = re.search(r"wavelength range\s+(.*?)\s+AFFILIATIONS", text, re.DOTALL)
if authors_block:
    authors = authors_block.group(1).replace('\n', ' ').strip()

# Sections: e.g., find all section headings and paragraphs
sections = re.findall(r"\n([IVX]+\. [A-Z \-]+)\n(.*?)(?=\n[IVX]+\. |$)", text, re.DOTALL)

# Print result
print("Title:", title)
print("Authors:", authors)
print("Abstract:", abstract)
print("Sections:")
for heading, content in sections:
    print("\n", heading.strip())
    print(content.strip()[:300], "...")


Title: DECEMBER 02 2021
The spectrum of a 1-μm-wavelength-driven tin microdroplet
laser-produced plasma source in the 5.5–265.5 nm
wavelength range
Z. Bouza 
 ; J. Byers 
 ; J. Scheers 
 ; R. Schupp 
 ; Y. Mostafa; L. Behnke; Z. Mazzotta 
 ; J. Sheil 
 ;
W. Ubachs 
 ; R. Hoekstra 
 ; M. Bayraktar 
 ; O. O. Versolato  
AIP Advances 11, 125003 (2021)
https://doi.org/10.1063/5.0073839
Articles You May Be Interested In
Radiation transport and scaling of optical depth in Nd:YAG laser-produced microdroplet-tin plasma
Appl. Phys. Lett. (September 2019)
Production of 13.5 nm light with 5% conversion efficiency from 2 μ m laser-driven tin microdroplet plasma
Appl. Phys. Lett. (December 2023)
Laser-induced vaporization of a stretching sheet of liquid tin
J. Appl. Phys. (February 2021)
 05 July 2025 13:58:50


--- Page 2 ---

AIP Advances
ARTICLE
scitation.org/journal/adv
The spectrum of a 1-μm-wavelength-driven tin
microdroplet laser-produced plasma source
in the 5.5–265.5 nm wavelength range
A

In [58]:
sections[0]

('I. INTRODUCTION',
 'Laser-produced plasma (LPP) generated from liquid tin (Sn)\nmicrodroplets provides extreme ultraviolet (EUV) light for mod-\nern nanolithography,1–7 enabling the continued reduction of feature\nsizes on affordable integrated circuits (ICs). Such laser-produced\nplasmas of tin are characterized by a strong emission peak near\n13.5 nm, originating from transitions between complex excited\nstates in multiply charged Sn10+–Sn15+ ions.8–17\nMultilayer optics are used in industrial lithography machines\nto collect the EUV light from its source and to provide an image\nof the so-called mask onto the wafer. These optics are designed to\nreflect wavelengths in a 2%-wavelength bandwidth centered around\n13.5 nm (the bandwidth limitation is, in part, due to the many ∼10\nrequired reflective surfaces).18,19 As such, most spectroscopic works\non Sn LPPs have focused on the “in-band” wavelength region17,20–24\nor on nearby out-of-band (OOB) EUV emission features,14,23,25–31\nsp