From f360bfc9126e63954bba73f0a200607aed8d4ab0 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Tue, 11 Apr 2023 23:17:43 +0200
Subject: [PATCH] ENH: cope with UCS2 fonts in text_extraction

fixes #1762
Encodings whose name contains "-UCS2-" are now decoded as UTF-16BE.
---
 pypdf/_cmap.py     | 3 +++
 tests/test_cmap.py | 9 +++++++++
 2 files changed, 12 insertions(+)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index 74b4f0fe1..e907b57a3 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -93,6 +93,7 @@ def build_char_map(
     "/GB-EUC-V": "gbk",  # TBC
     "/GBpc-EUC-H": "gb2312",  # TBC
     "/GBpc-EUC-V": "gb2312",  # TBC
+    # "-UCS2-" names are not listed here: parse_encoding maps them to utf-16-be
 }
 
 
@@ -146,6 +147,8 @@ def parse_encoding(
                 encoding = charset_encoding[enc].copy()
             elif enc in _predefined_cmap:
                 encoding = _predefined_cmap[enc]
+            elif "-UCS2-" in enc:
+                encoding = "utf-16-be"
             else:
                 raise Exception("not found")
         except Exception:
diff --git a/tests/test_cmap.py b/tests/test_cmap.py
index 666d3ecfa..a7b1b451f 100644
--- a/tests/test_cmap.py
+++ b/tests/test_cmap.py
@@ -132,3 +132,12 @@ def test_iss1533():
     reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
     reader.pages[0].extract_text()  # no error
     assert build_char_map("/F", 200, reader.pages[0])[3]["\x01"] == "Ü"
+
+
+@pytest.mark.enable_socket()
+def test_ucs2(caplog):
+    url = "https://github.com/py-pdf/pypdf/files/11190189/pdf_font_garbled.pdf"
+    name = "tstUCS2.pdf"
+    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
+    reader.pages[1].extract_text()  # no error
+    assert caplog.text == ""