diff --git a/sample-files b/sample-files index d89d531f4..d3d250321 160000 --- a/sample-files +++ b/sample-files @@ -1 +1 @@ -Subproject commit d89d531f4bed7c5e692e4c094645133baf50a044 +Subproject commit d3d250321b01ca1194e16a814d24508897862fe8 diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py index e67e218c4..27490cbda 100644 --- a/tests/test_text_extraction.py +++ b/tests/test_text_extraction.py @@ -3,9 +3,10 @@ The tested code might be in _page.py. """ - from pathlib import Path +import pytest + from pypdf import PdfReader from pypdf._text_extraction import set_custom_rtl @@ -15,9 +16,10 @@ SAMPLE_ROOT = PROJECT_ROOT / "sample-files" -def test_multi_language(): +@pytest.mark.parametrize(("visitor_text"), [None, lambda a, b, c, d, e: None]) +def test_multi_language(visitor_text): reader = PdfReader(RESOURCE_ROOT / "multilang.pdf") - txt = reader.pages[0].extract_text() + txt = reader.pages[0].extract_text(visitor_text=visitor_text) assert "Hello World" in txt, "English not correctly extracted" # iss #1296 assert "مرحبا بالعالم" in txt, "Arabic not correctly extracted" @@ -27,15 +29,29 @@ def test_multi_language(): assert "こんにちは世界" in txt, "Japanese not correctly extracted" # check customizations set_custom_rtl(None, None, "Russian:") - assert ( - ":naissuR" in reader.pages[0].extract_text() + assert ":naissuR" in reader.pages[0].extract_text( + visitor_text=visitor_text ), "(1) CUSTOM_RTL_SPECIAL_CHARS failed" set_custom_rtl(None, None, [ord(x) for x in "Russian:"]) - assert ( - ":naissuR" in reader.pages[0].extract_text() + assert ":naissuR" in reader.pages[0].extract_text( + visitor_text=visitor_text ), "(2) CUSTOM_RTL_SPECIAL_CHARS failed" set_custom_rtl(0, 255, None) - assert ":hsilgnE" in reader.pages[0].extract_text(), "CUSTOM_RTL_MIN/MAX failed" + assert ":hsilgnE" in reader.pages[0].extract_text( + visitor_text=visitor_text + ), "CUSTOM_RTL_MIN/MAX failed" set_custom_rtl("A", "z", []) - assert ":hsilgnE" in reader.pages[0].extract_text(), "CUSTOM_RTL_MIN/MAX failed" + assert ":hsilgnE" in reader.pages[0].extract_text( + visitor_text=visitor_text + ), "CUSTOM_RTL_MIN/MAX failed" set_custom_rtl(-1, -1, []) # to prevent further errors + + reader = PdfReader(SAMPLE_ROOT / "015-arabic/habibi-rotated.pdf") + assert "habibi" in reader.pages[0].extract_text(visitor_text=visitor_text) + assert "حَبيبي" in reader.pages[0].extract_text(visitor_text=visitor_text) + assert "habibi" in reader.pages[1].extract_text(visitor_text=visitor_text) + assert "حَبيبي" in reader.pages[1].extract_text(visitor_text=visitor_text) + assert "habibi" in reader.pages[2].extract_text(visitor_text=visitor_text) + assert "حَبيبي" in reader.pages[2].extract_text(visitor_text=visitor_text) + assert "habibi" in reader.pages[3].extract_text(visitor_text=visitor_text) + assert "حَبيبي" in reader.pages[3].extract_text(visitor_text=visitor_text)