Skip to content

Commit

Permalink
TST: Text extraction with rotated arabic file (#1793)
Browse files Browse the repository at this point in the history
  • Loading branch information
MartinThoma committed Apr 16, 2023
1 parent 23d81ff commit 86502b9
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 10 deletions.
2 changes: 1 addition & 1 deletion sample-files
34 changes: 25 additions & 9 deletions tests/test_text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
The tested code might be in _page.py.
"""

from pathlib import Path

import pytest

from pypdf import PdfReader
from pypdf._text_extraction import set_custom_rtl

Expand All @@ -15,9 +16,10 @@
SAMPLE_ROOT = PROJECT_ROOT / "sample-files"


-def test_multi_language():
+@pytest.mark.parametrize(("visitor_text"), [None, lambda a, b, c, d, e: None])
+def test_multi_language(visitor_text):
     reader = PdfReader(RESOURCE_ROOT / "multilang.pdf")
-    txt = reader.pages[0].extract_text()
+    txt = reader.pages[0].extract_text(visitor_text=visitor_text)
assert "Hello World" in txt, "English not correctly extracted"
# iss #1296
assert "مرحبا بالعالم" in txt, "Arabic not correctly extracted"
Expand All @@ -27,15 +29,29 @@ def test_multi_language():
assert "こんにちは世界" in txt, "Japanese not correctly extracted"
# check customizations
set_custom_rtl(None, None, "Russian:")
-    assert (
-        ":naissuR" in reader.pages[0].extract_text()
+    assert ":naissuR" in reader.pages[0].extract_text(
+        visitor_text=visitor_text
     ), "(1) CUSTOM_RTL_SPECIAL_CHARS failed"
     set_custom_rtl(None, None, [ord(x) for x in "Russian:"])
-    assert (
-        ":naissuR" in reader.pages[0].extract_text()
+    assert ":naissuR" in reader.pages[0].extract_text(
+        visitor_text=visitor_text
     ), "(2) CUSTOM_RTL_SPECIAL_CHARS failed"
     set_custom_rtl(0, 255, None)
-    assert ":hsilgnE" in reader.pages[0].extract_text(), "CUSTOM_RTL_MIN/MAX failed"
+    assert ":hsilgnE" in reader.pages[0].extract_text(
+        visitor_text=visitor_text
+    ), "CUSTOM_RTL_MIN/MAX failed"
     set_custom_rtl("A", "z", [])
-    assert ":hsilgnE" in reader.pages[0].extract_text(), "CUSTOM_RTL_MIN/MAX failed"
+    assert ":hsilgnE" in reader.pages[0].extract_text(
+        visitor_text=visitor_text
+    ), "CUSTOM_RTL_MIN/MAX failed"
set_custom_rtl(-1, -1, []) # to prevent further errors

+    reader = PdfReader(SAMPLE_ROOT / "015-arabic/habibi-rotated.pdf")
+    assert "habibi" in reader.pages[0].extract_text(visitor_text=visitor_text)
+    assert "حَبيبي" in reader.pages[0].extract_text(visitor_text=visitor_text)
+    assert "habibi" in reader.pages[1].extract_text(visitor_text=visitor_text)
+    assert "حَبيبي" in reader.pages[1].extract_text(visitor_text=visitor_text)
+    assert "habibi" in reader.pages[2].extract_text(visitor_text=visitor_text)
+    assert "حَبيبي" in reader.pages[2].extract_text(visitor_text=visitor_text)
+    assert "habibi" in reader.pages[3].extract_text(visitor_text=visitor_text)
+    assert "حَبيبي" in reader.pages[3].extract_text(visitor_text=visitor_text)

0 comments on commit 86502b9

Please sign in to comment.