Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16607,6 +16607,10 @@ def extractWORDS(self, delimiters=None):
and not mupdf.fz_is_infinite_rect(tp_rect)
):
continue

if buflen == 0 and ch.m_internal.c == 0x200d:
# ZERO WIDTH JOINER cannot start a word
continue
word_delimiter = JM_is_word_delimiter(ch.m_internal.c, delimiters)
this_char_rtl = JM_is_rtl_char(ch.m_internal.c)
if word_delimiter or this_char_rtl != last_char_rtl:
Expand Down
6 changes: 5 additions & 1 deletion src/extra.i
Original file line number Diff line number Diff line change
Expand Up @@ -3295,7 +3295,11 @@ PyObject* extractWORDS(mupdf::FzStextPage& this_tpage, PyObject *delimiters)
{
continue;
}

// Prevent the Unicode ZERO WIDTH JOINER (U+200D) from starting a word.
if (buflen == 0 && ch.m_internal->c == 0x200d)
{
continue;
}
int word_delimiter = JM_is_word_delimiter(ch.m_internal->c, delimiters);
int this_char_rtl = JM_is_rtl_char(ch.m_internal->c);
if (word_delimiter || this_char_rtl != last_char_rtl)
Expand Down
Binary file added tests/resources/test_4716.pdf
Binary file not shown.
15 changes: 15 additions & 0 deletions tests/test_4716.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import pymupdf
import os

def test_4716():
    """Confirm that a ZERO WIDTH JOINER never starts an extracted word.

    Opens ``resources/test_4716.pdf`` (located next to this test file),
    collects the text of every word returned by ``page.get_text("words")``
    across all pages, and asserts that the resulting set equals the
    expected set of words.
    """
    script_dir = os.path.dirname(__file__)
    filename = os.path.join(script_dir, "resources", "test_4716.pdf")
    expected = {"+25.00", "Любимый", "-10.00"}
    # Open the document as a context manager so it is closed even when
    # text extraction raises; the original never closed it.
    with pymupdf.open(filename) as doc:
        # Index 4 of each "words" entry holds the word string that is
        # compared against the expected texts.
        word_text = {w[4] for page in doc for w in page.get_text("words")}
    assert word_text == expected