From 755023defc58ac26edcdae78c661a976c4793814 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Thu, 25 Aug 2022 23:03:48 +0200 Subject: [PATCH 1/4] ROB : fix errors/warnings on no /resources with extract_text fix #1272 (in text) and #1269 (in Xform) --- PyPDF2/_page.py | 5 ++++- tests/test_page.py | 9 ++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py index 77a15ab32..a12822881 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -1140,7 +1140,10 @@ def _extract_text( cmaps: Dict[ str, Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str]] ] = {} - resources_dict = cast(DictionaryObject, obj["/Resources"]) + try: + resources_dict = cast(DictionaryObject, obj["/Resources"]) + except Exception: + return "" # no resources means no text is possible (no font) if "/Font" in resources_dict: for f in cast(DictionaryObject, resources_dict["/Font"]): cmaps[f] = build_char_map(f, space_width, obj) diff --git a/tests/test_page.py b/tests/test_page.py index 2a9c97b00..9797e75e0 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -238,6 +238,13 @@ def test_extract_text_single_quote_op(): page.extract_text() +def test_no_ressources_on_text_extract(): + url = "https://raw.githubusercontent.com/eagletrt/wiki/0f3f16309604f665a47595c890d15af1b3aec6d6/fenice-telemetry-tx/PCB%20Outputs/Pdf/Edge%20Mount%20SMA/TelemetryTX_EM.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name="tika-964029.pdf"))) + for page in reader.pages: + page.extract_text() + + def test_iss_1142(): # check fix for problem of context save/restore (q/Q) url = "https://github.com/py-pdf/PyPDF2/files/9150656/ST.2019.PDF" @@ -280,7 +287,7 @@ def test_extract_text_page_pdf_impossible_decode_xform(caplog): for page in reader.pages: page.extract_text() warn_msgs = normalize_warnings(caplog.text) - assert warn_msgs == [" impossible to decode XFormObject /Meta203"] + assert warn_msgs == [""] # text 
extraction recognise no texg def test_extract_text_operator_t_star(): # L1266, L1267 From 61a306b436bedf0d50ae8110347fa73fc81de488 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Fri, 26 Aug 2022 08:02:15 +0200 Subject: [PATCH 2/4] Update tests/test_page.py Co-authored-by: Matthew Peveler --- tests/test_page.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_page.py b/tests/test_page.py index 9797e75e0..e74690d80 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -287,7 +287,7 @@ def test_extract_text_page_pdf_impossible_decode_xform(caplog): for page in reader.pages: page.extract_text() warn_msgs = normalize_warnings(caplog.text) - assert warn_msgs == [""] # text extraction recognise no texg + assert warn_msgs == [""] # text extraction recognise no text def test_extract_text_operator_t_star(): # L1266, L1267 From 2cb4c5f90d93e7c0d89bf82763d88cacc31f7019 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Fri, 26 Aug 2022 08:04:20 +0200 Subject: [PATCH 3/4] Update tests/test_page.py Co-authored-by: Matthew Peveler --- tests/test_page.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_page.py b/tests/test_page.py index e74690d80..90e40c0a5 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -239,7 +239,7 @@ def test_extract_text_single_quote_op(): def test_no_ressources_on_text_extract(): - url = "https://raw.githubusercontent.com/eagletrt/wiki/0f3f16309604f665a47595c890d15af1b3aec6d6/fenice-telemetry-tx/PCB%20Outputs/Pdf/Edge%20Mount%20SMA/TelemetryTX_EM.pdf" + url = "https://github.com/py-pdf/PyPDF2/files/9428434/TelemetryTX_EM.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name="tika-964029.pdf"))) for page in reader.pages: page.extract_text() From a4feaba0204246bec4cc733648e2bf15ec3a4747 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 27 Aug 2022 
21:08:17 +0200 Subject: [PATCH 4/4] look for resources in Parents --- PyPDF2/_page.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py index a12822881..f818ff544 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -1141,9 +1141,14 @@ def _extract_text( str, Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str]] ] = {} try: - resources_dict = cast(DictionaryObject, obj["/Resources"]) + objr = obj + while NameObject("/Resources") not in objr: + # /Resources can be inherited sometimes so we look to parents + objr = objr["/Parent"].get_object() + # if there is no parent left, no /Resources is available => an exception will be raised + resources_dict = cast(DictionaryObject, objr["/Resources"]) except Exception: - return "" # no resources means no text is possible (no font) + return "" # no resources means no text is possible (no font); we consider the file as not damaged, so no need to check for TJ or Tj if "/Font" in resources_dict: for f in cast(DictionaryObject, resources_dict["/Font"]): cmaps[f] = build_char_map(f, space_width, obj)