In [5]:
# ======================================
# PDF Scientific Paper Text Extractor
# (ready for OpenAI subject-relation-object data extraction)
# ======================================
import os
import re
import fitz  # PyMuPDF

In [6]:
# --- Config & File Paths ---
PDF_FILENAME = "2021Bouza.pdf"            # Change to your PDF name
RAW_DIR = "data/raw"
PROCESSED_DIR = "data/processed"
BASENAME = os.path.splitext(PDF_FILENAME)[0]
PYMUPDF_TXT_PATH = os.path.join(PROCESSED_DIR, f"{BASENAME}_pymupdf.txt")
CLEANED_OUTPUT_PATH = os.path.join(PROCESSED_DIR, f"{BASENAME}_cleaned_for_llm.txt")


# --- Directory Helpers ---
def ensure_dir_exists(folder):
    """Create folder if it doesn't exist."""
    if not os.path.exists(folder):
        os.makedirs(folder)

ensure_dir_exists(PROCESSED_DIR)


In [7]:
# --- PDF Extraction with PyMuPDF ---
def extract_text_with_pymupdf(pdf_path, output_txt_path):
    """
    Extracts all text from a PDF using PyMuPDF, one page at a time.
    Each page is separated for easier debugging and later parsing.
    """
    doc = fitz.open(pdf_path)
    with open(output_txt_path, "w", encoding="utf-8") as out:
        for i, page in enumerate(doc, start=1):
            out.write(f"\n\n--- Page {i} ---\n\n")
            out.write(page.get_text())
    print(f"[PyMuPDF] Extracted text written to {output_txt_path}")

pdf_path = os.path.join(RAW_DIR, PDF_FILENAME)
if not os.path.isfile(PYMUPDF_TXT_PATH):
    extract_text_with_pymupdf(pdf_path, PYMUPDF_TXT_PATH)

In [8]:
# --- Cleaning function ---
def clean_pymupdf_text(raw_text):
    """
    Cleans up PyMuPDF-extracted text from scientific PDFs:
    - Removes repeated headers/footers, page numbers, timestamps
    - Merges lines into paragraphs
    - Removes common journal artifacts
    """
    patterns_to_remove = [
        r'AIP Advances', r'ARTICLE', r'scitation\.org/journal/adv',
        r'©\s?20\d{2}', r'\d{2} July 20\d{2} \d{2}:\d{2}:\d{2}', r'Page \d+', r'--- Page \d+ ---',
        r'Articles You May Be Interested In.*?(?=\n)',  # lines starting with this
        r'https://doi\.org/\S+', r'All article content,.*?license',
        r'\d{1,2},? ?\d{4}',  # random date lines
    ]
    for pat in patterns_to_remove:
        raw_text = re.sub(pat, '', raw_text, flags=re.IGNORECASE | re.MULTILINE)

    # Remove excessive blank lines (keep 2 max)
    raw_text = re.sub(r'\n\s*\n\s*\n+', '\n\n', raw_text)

    # Remove leading/trailing whitespace from lines
    lines = [line.strip() for line in raw_text.split('\n')]
    raw_text = '\n'.join(lines)

    # Merge broken lines into paragraphs, but keep headings/sections
    def merge_lines(text):
        merged = []
        lines = text.split('\n')
        buffer = ''
        for line in lines:
            # Headings or section starts: flush buffer, keep as is
            if (not line.strip()) or re.match(r'^[A-Z][A-Z \-\d\.]{4,}$', line) or re.match(r'^(I{1,4}|V?I{0,3}|X{0,1})\. ', line):
                if buffer:
                    merged.append(buffer.strip())
                    buffer = ''
                if line.strip():
                    merged.append(line.strip())
            else:
                if buffer and not buffer.endswith((' ', '-', '–', '—')):
                    buffer += ' '
                buffer += line.strip()
        if buffer:
            merged.append(buffer.strip())
        return '\n\n'.join(merged)

    cleaned_text = merge_lines(raw_text)
    cleaned_text = re.sub(r'\n{3,}', '\n\n', cleaned_text)
    cleaned_text = cleaned_text.strip()
    return cleaned_text

# --- Read and clean the PyMuPDF text ---
with open(PYMUPDF_TXT_PATH, "r", encoding="utf-8") as f:
    raw_text = f.read()
cleaned_text = clean_pymupdf_text(raw_text)

In [9]:
# --- Extract metadata and sections from cleaned text ---
def extract_metadata_from_cleaned_text(cleaned_text):
    """
    Extracts title, authors, abstract, and sections from cleaned text.
    """
    # Grab first non-empty line as title
    lines = [line.strip() for line in cleaned_text.splitlines() if line.strip()]
    title = lines[0] if lines else ""
    # Grab next line(s) as authors if contains ';' or ',' (naive, works for most scientific papers)
    author_line = next((line for line in lines[1:6] if ';' in line or ',' in line), "")
    authors = author_line

    # Abstract: text after 'ABSTRACT' until the first section heading
    abstract_match = re.search(r'ABSTRACT\s*(.*?)\n(?:[IVX]+\. [A-Z \-]+)', cleaned_text, re.DOTALL)
    abstract = abstract_match.group(1).strip() if abstract_match else ""

    # Sections: heading + block until next heading (I., II., etc.)
    sections = re.findall(r'\n([IVX]+\. [A-Z \-]+)\n(.*?)(?=\n[IVX]+\. [A-Z \-]+\n|$)', cleaned_text, re.DOTALL)

    return title, authors, abstract, sections

title, authors, abstract, sections = extract_metadata_from_cleaned_text(cleaned_text)


In [10]:
# --- Display Results ---
print("="*60)
print("Title:", title)
print("Authors:", authors)
print("Abstract (truncated):", (abstract[:400] + '...') if len(abstract) > 400 else abstract)
print("\nSections (first 2 shown):")
for heading, content in sections[:2]:
    print(f"\n{heading.strip()}\n{content.strip()[:400]}{'...' if len(content.strip()) > 400 else ''}")
print("="*60)

Title: ---  ---
Authors:  View Online  Export Citation RESEARCH  |  DECEMBER The spectrum of a 1-μm-wavelength-driven tin microdroplet laser-produced plasma source in the 5.5–265.5 nm wavelength range Z. Bouza ; J. Byers ; J. Scheers ; R. Schupp ; Y. Mostafa; L. Behnke; Z. Mazzotta ; J. Sheil ; W. Ubachs ; R. Hoekstra ; M. Bayraktar ; O. O. Versolato  03 (2021)
Abstract (truncated): We present a calibrated spectrum in the 5.5–265.5 nm range from a microdroplet-tin Nd:YAG-laser-produced plasma under conditions relevant for the production of extreme ultraviolet (EUV) light at 13.5 nm for nanolithography. The plasma emission spectrum obtained using a custom-built transmission grating spectrometer results from a careful calibration of a series of filters enabling measurements fre...

Sections (first 2 shown):

I. INTRODUCTION
Laser-produced plasma (LPP) generated from liquid tin (Sn) microdroplets provides extreme ultraviolet (EUV) light for mod-ern nanolithography,1–7 enabling the cont

In [11]:
# --- Save structured, cleaned output ---
with open(CLEANED_OUTPUT_PATH, "w", encoding="utf-8") as out:
    out.write(f"Title: {title}\n")
    out.write(f"Authors: {authors}\n")
    out.write(f"Abstract: {abstract}\n\n")
    for heading, content in sections:
        out.write(f"{heading.strip()}\n{content.strip()}\n\n")
print(f"\n[Done] Cleaned output ready for OpenAI input at: {CLEANED_OUTPUT_PATH}")


[Done] Cleaned output ready for OpenAI input at: data/processed/2021Bouza_cleaned_for_llm.txt
