# Patch summary (commentary only; the patch text below is unchanged).
#
# Purpose: stop U+200D (ZERO WIDTH JOINER) from becoming the first character
# of a word during word extraction.
#
# Files touched by this patch:
#   * src/__init__.py, extractWORDS(self, delimiters=None):
#       adds a guard inside the per-character loop. When the word buffer is
#       still empty (buflen == 0) and the current character is 0x200d, the
#       character is skipped with `continue`. The skip happens before the
#       word-delimiter and RTL checks for that character are evaluated.
#   * src/extra.i, extractWORDS(mupdf::FzStextPage&, PyObject *delimiters):
#       the same guard in the C++ implementation, so both code paths agree.
#   * tests/resources/test_4716.pdf: new binary fixture (content not shown).
#   * tests/test_4716.py: new test that opens the fixture, gathers field 4 of
#       every entry returned by page.get_text("words") into a set, and expects
#       exactly {"+25.00", "Любимый", "-10.00"}.
#
# NOTE(review): only code point 0x200d is handled; other zero-width code
#   points are not covered by this guard. Confirm that this is intended.
# NOTE(review): the test never closes `doc`, and builds `expected` with
#   set([...]) where a set literal would do.
# NOTE(review): this patch was captured with its line breaks collapsed into a
#   single line, so hunk indentation is not recoverable from this text alone.
#   Regenerate it from the repository before attempting to apply it.
diff --git a/src/__init__.py b/src/__init__.py index c587c4977..143833714 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -16607,6 +16607,10 @@ def extractWORDS(self, delimiters=None): and not mupdf.fz_is_infinite_rect(tp_rect) ): continue + + if buflen == 0 and ch.m_internal.c == 0x200d: + # ZERO WIDTH JOINER cannot start a word + continue word_delimiter = JM_is_word_delimiter(ch.m_internal.c, delimiters) this_char_rtl = JM_is_rtl_char(ch.m_internal.c) if word_delimiter or this_char_rtl != last_char_rtl: diff --git a/src/extra.i b/src/extra.i index 986c88735..9d448dac1 100644 --- a/src/extra.i +++ b/src/extra.i @@ -3295,7 +3295,11 @@ PyObject* extractWORDS(mupdf::FzStextPage& this_tpage, PyObject *delimiters) { continue; } - + // prevent Unicode ZWJ 0x200d to start a word + if (buflen == 0 && ch.m_internal->c == 0x200d) + { + continue; + } int word_delimiter = JM_is_word_delimiter(ch.m_internal->c, delimiters); int this_char_rtl = JM_is_rtl_char(ch.m_internal->c); if (word_delimiter || this_char_rtl != last_char_rtl) diff --git a/tests/resources/test_4716.pdf b/tests/resources/test_4716.pdf new file mode 100644 index 000000000..25a9a57f5 Binary files /dev/null and b/tests/resources/test_4716.pdf differ diff --git a/tests/test_4716.py b/tests/test_4716.py new file mode 100644 index 000000000..09692277a --- /dev/null +++ b/tests/test_4716.py @@ -0,0 +1,15 @@ +import pymupdf +import os + +def test_4716(): + """Confirm that ZERO WIDTH JOINER will never start a word.""" + script_dir = os.path.dirname(__file__) + filename = os.path.join(script_dir, "resources", "test_4716.pdf") + doc = pymupdf.open(filename) + expected = set(["+25.00", "Любимый", "-10.00"]) + word_text = set() + for page in doc: + words = page.get_text("words") + for w in words: + word_text.add(w[4]) + assert word_text == expected