Skip to content

Commit

Permalink
ENH: Improve PDFium text extraction (#11)
Browse files Browse the repository at this point in the history
Several additional changes:

* ENH: Add PDFium image extraction
* ROB: Make opening/parsing the cache file more robust
* MAINT: Update deprecated pydantic API
* MAINT: Add pdfrw to main.in
  • Loading branch information
mqq-marek committed Oct 31, 2023
1 parent 4f14b3c commit 24c51dd
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 13 deletions.
15 changes: 9 additions & 6 deletions benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import time
from io import BytesIO
from itertools import product
from json import JSONDecodeError
from pathlib import Path
from typing import Literal

Expand Down Expand Up @@ -34,7 +35,7 @@
pymupdf_watermarking,
pypdf_get_text,
pypdf_image_extraction,
pypdf_watermarking,
pypdf_watermarking, tika_get_text, pdfium_image_extraction,
)
from pdf_benchmark.output import write_benchmark_report
from pdf_benchmark.score import get_text_extraction_score
Expand All @@ -48,8 +49,11 @@ def main(
) -> None:
cache_path = Path("cache.json")
if cache_path.exists():
with open(cache_path) as f:
cache = Cache.parse_obj(json.load(f))
try:
with open(cache_path) as f:
cache = Cache.model_validate(json.load(f))
except JSONDecodeError:
cache = Cache()
else:
cache = Cache()
names = sorted(list(libraries.keys()))
Expand Down Expand Up @@ -154,9 +158,7 @@ def write_single_result(
"Tika",
"tika",
"https://pypi.org/project/tika/",
text_extraction_function=lambda n: parser.from_buffer(BytesIO(n))[
"content"
],
text_extraction_function=tika_get_text,
version=tika.__version__,
dependencies="Apache Tika",
license="Apache v2",
Expand Down Expand Up @@ -233,6 +235,7 @@ def write_single_result(
text_extraction_function=pdfium_get_text,
version=pypdfium2.V_PYPDFIUM2,
watermarking_function=None,
image_extraction_function=pdfium_image_extraction,
license="Apache-2.0 or BSD-3-Clause",
last_release_date="2023-07-04",
dependencies="PDFium (Foxit/Google)",
Expand Down
2 changes: 1 addition & 1 deletion pdf_benchmark/data_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,4 +88,4 @@ def has_doc(self, library: Library, document: Document) -> bool:

def write(self, path: Path):
    """Serialize this model as indented JSON and write it to *path*.

    The file is opened in text mode for writing, so any existing content
    at *path* is replaced.
    """
    with open(path, "w") as f:
        # Exactly one serialization call: writing both the legacy `.json()`
        # output and the `model_dump_json()` output would put two JSON
        # documents back to back in the file, which cannot be parsed again.
        f.write(self.model_dump_json(indent=4))
54 changes: 49 additions & 5 deletions pdf_benchmark/library_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,9 @@
from borb.pdf.pdf import PDF
from borb.toolkit.text.simple_text_extraction import SimpleTextExtraction
from pdfminer.high_level import extract_pages
from requests import ReadTimeout

from .text_extraction_post_processing import postprocess
from .text_extraction_post_processing import postprocess, PDFIUM_ZERO_WIDTH_NO_BREAK_SPACE


def pymupdf_get_text(data: bytes) -> str:
Expand All @@ -32,16 +33,44 @@ def pypdf_get_text(data: bytes) -> str:
return text


def pdfium_new_line_after_hyphens(text):
    """Return *text* with a newline inserted after every PDFium marker character.

    Each occurrence of ``PDFIUM_ZERO_WIDTH_NO_BREAK_SPACE`` is kept and
    immediately followed by ``"\\n"``, so the marker ends up at the end of a
    line.
    """
    marker = PDFIUM_ZERO_WIDTH_NO_BREAK_SPACE
    pieces = text.split(marker)
    return (marker + '\n').join(pieces)


def pdfium_get_text(data: bytes) -> str:
    """Extract the text of a PDF with PDFium and post-process it.

    Args:
        data: Raw bytes of the PDF document.

    Returns:
        The string produced by ``postprocess`` from the per-page texts and
        their page labels.
    """
    texts = []
    page_labels = []
    pdf = pdfium.PdfDocument(data)

    for i in range(len(pdf)):
        # Fall back to the 1-based page number when the document provides
        # no (or an empty) label for this page.
        label = pdf.get_page_label(i) or str(i + 1)
        page_labels.append(label)
        textpage = pdf.get_page(i).get_textpage()
        # Read the page text once; the marker characters get a trailing
        # newline so they sit at line ends for the post-processing step.
        texts.append(pdfium_new_line_after_hyphens(textpage.get_text_range()))
    return postprocess(texts, page_labels)


def pdfium_image_extraction(data: bytes) -> list[tuple[str, bytes]]:
    """Extract the image objects of a PDF with PDFium.

    Args:
        data: Raw bytes of the PDF document.

    Returns:
        A list of ``(file name, image bytes)`` tuples. File names follow the
        pattern ``page-<page>-image-<n>.jpg`` with both numbers starting at 1.
        Extraction is best effort: on any exception a message is printed and
        the images collected so far are returned.
    """
    extracted = []
    try:
        document = pdfium.PdfDocument(data)
        for page_no in range(1, len(document) + 1):
            page = document.get_page(page_no - 1)
            # Lazily keep only the image objects so numbering counts images,
            # not every page object.
            image_objects = (
                candidate
                for candidate in page.get_objects()
                if isinstance(candidate, pdfium.PdfImage)
            )
            for image_no, image_obj in enumerate(image_objects, start=1):
                buffer = BytesIO()
                image_obj.extract(buffer)
                extracted.append(
                    (f"page-{page_no}-image-{image_no}.jpg", buffer.getvalue())
                )
    except Exception as exc:
        print(f"pdfium Image extraction failure: {exc}")
    return extracted


def pypdf_watermarking(watermark_data: bytes, data: bytes) -> bytes:
watermark_pdf = pypdf.PdfReader(BytesIO(watermark_data))
watermark_page = watermark_pdf.pages[0]
Expand Down Expand Up @@ -87,7 +116,7 @@ def pymupdf_image_extraction(data: bytes) -> list[tuple[str, bytes]]:
image_bytes = base_image["image"]
image_ext = base_image["ext"]
images.append(
(f"image{page_index+1}_{image_index}.{image_ext}", image_bytes)
(f"image{page_index + 1}_{image_index}.{image_ext}", image_bytes)
)
return images

Expand Down Expand Up @@ -170,7 +199,10 @@ def pdftotext_get_text(data: bytes) -> str:
new_file, filename = tempfile.mkstemp()
with open(filename, "wb") as fp:
fp.write(data)
args = ["/usr/bin/pdftotext", "-enc", "UTF-8", filename, "-"]
pdf_to_text_path = "/usr/bin/pdftotext"
if not os.path.exists(pdf_to_text_path):
pdf_to_text_path = 'pdftotext'
args = [pdf_to_text_path, "-enc", "UTF-8", filename, "-"]
res = subprocess.run(args, capture_output=True)
output = res.stdout.decode("utf-8")
os.close(new_file)
Expand All @@ -191,3 +223,15 @@ def pdfrw_watermarking(watermark_data: bytes, data: bytes) -> bytes:

out_buffer.seek(0)
return out_buffer.read()


def tika_get_text(data: bytes) -> str:
    """Extract the text of a PDF through Tika.

    Args:
        data: Raw bytes of the PDF document.

    Returns:
        The ``"content"`` entry of Tika's parse result, or a fixed failure
        marker string when the request raises ``ReadTimeout``.
    """
    from tika import parser

    # NOTE(review): presumably forwarded to the HTTP request as
    # (connect, read) timeouts in seconds — confirm against tika's client.
    request_options = {"timeout": (1, 100)}
    try:
        parsed = parser.from_buffer(BytesIO(data), requestOptions=request_options)
        return parsed["content"]
    except ReadTimeout as ex:
        print("Tika timeout:", ex)
    return "[[[Tika text extraction failed!]]]"
4 changes: 3 additions & 1 deletion pdf_benchmark/text_extraction_post_processing.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# NOTE: the value is U+FFFE, a Unicode noncharacter. The character actually
# named ZERO WIDTH NO-BREAK SPACE is U+FEFF, so the constant's name does not
# match its value. remove_hyphens treats a line ending in this character the
# same way as a line ending in "-".
# NOTE(review): presumably this is the marker PDFium emits for a hyphen at a
# line break — confirm against the PDFium text-extraction behaviour.
PDFIUM_ZERO_WIDTH_NO_BREAK_SPACE = "\ufffe"

def postprocess(extracted_texts: list[str], page_labels: list[str]) -> str:
"""Pass a list of all extracted texts from all pages."""
extracted_texts = [replace_ligatures(t) for t in extracted_texts]
Expand Down Expand Up @@ -30,7 +32,7 @@ def remove_hyphens(text: str) -> str:
# Find dashes
line_numbers = []
for line_no, line in enumerate(lines[:-1]):
if line.endswith("-"):
if line.endswith("-") or line.endswith(PDFIUM_ZERO_WIDTH_NO_BREAK_SPACE):
line_numbers.append(line_no)

# Replace
Expand Down
1 change: 1 addition & 0 deletions requirements/main.in
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@ pdftotext
pydantic
pymupdf
pypdfium2
pdfrw
lxml

0 comments on commit 24c51dd

Please sign in to comment.