Skip to content

Commit

Permalink
ENH: Cope with UCS2 fonts in text_extraction (#1785)
Browse files Browse the repository at this point in the history
UCS2-encoded text is to be decoded as UTF-16BE.

Fixes #1762
  • Loading branch information
pubpub-zz committed Apr 15, 2023
1 parent 3962c99 commit 20fbe3f
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 0 deletions.
3 changes: 3 additions & 0 deletions pypdf/_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ def build_char_map(
"/GB-EUC-V": "gbk", # TBC
"/GBpc-EUC-H": "gb2312", # TBC
"/GBpc-EUC-V": "gb2312", # TBC
# CMap names containing "-UCS2-" are not listed here; they are handled in code (decoded as utf-16-be)
}


Expand Down Expand Up @@ -146,6 +147,8 @@ def parse_encoding(
encoding = charset_encoding[enc].copy()
elif enc in _predefined_cmap:
encoding = _predefined_cmap[enc]
elif "-UCS2-" in enc:
encoding = "utf-16-be"
else:
raise Exception("not found")
except Exception:
Expand Down
9 changes: 9 additions & 0 deletions tests/test_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,3 +132,12 @@ def test_iss1533():
reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
reader.pages[0].extract_text() # no error
assert build_char_map("/F", 200, reader.pages[0])[3]["\x01"] == "Ü"


@pytest.mark.enable_socket()
def test_ucs2(caplog):
    """Check that text extraction on the UCS2 sample PDF logs nothing.

    Downloads the sample document, runs ``extract_text()`` on its second
    page and asserts that no log output was captured while doing so.
    """
    pdf_bytes = get_pdf_from_url(
        "https://github.com/py-pdf/pypdf/files/11190189/pdf_font_garbled.pdf",
        name="tstUCS2.pdf",
    )
    reader = PdfReader(BytesIO(pdf_bytes))
    second_page = reader.pages[1]
    second_page.extract_text()  # must complete without raising
    # Any message captured by the logging fixture fails the test.
    assert caplog.text == ""

0 comments on commit 20fbe3f

Please sign in to comment.