From f360bfc9126e63954bba73f0a200607aed8d4ab0 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 11 Apr 2023 23:17:43 +0200 Subject: [PATCH] ENH: cope with UCS2 fonts in text_extraction fixes #1762 UCS2 encodings are to be read as utf-16-be --- pypdf/_cmap.py | 3 +++ tests/test_cmap.py | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 74b4f0fe1..e907b57a3 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -93,6 +93,7 @@ def build_char_map( "/GB-EUC-V": "gbk", # TBC "/GBpc-EUC-H": "gb2312", # TBC "/GBpc-EUC-V": "gb2312", # TBC + # UCS2 in code } @@ -146,6 +147,8 @@ def parse_encoding( encoding = charset_encoding[enc].copy() elif enc in _predefined_cmap: encoding = _predefined_cmap[enc] + elif "-UCS2-" in enc: + encoding = "utf-16-be" else: raise Exception("not found") except Exception: diff --git a/tests/test_cmap.py b/tests/test_cmap.py index 666d3ecfa..a7b1b451f 100644 --- a/tests/test_cmap.py +++ b/tests/test_cmap.py @@ -132,3 +132,12 @@ def test_iss1533(): reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) reader.pages[0].extract_text() # no error assert build_char_map("/F", 200, reader.pages[0])[3]["\x01"] == "Ü" + + +@pytest.mark.enable_socket() +def test_ucs2(caplog): + url = "https://github.com/py-pdf/pypdf/files/11190189/pdf_font_garbled.pdf" + name = "tstUCS2.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader.pages[1].extract_text() # no error + assert caplog.text == ""