diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py index 77a15ab32..f818ff544 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -1140,7 +1140,15 @@ def _extract_text( cmaps: Dict[ str, Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str]] ] = {} - resources_dict = cast(DictionaryObject, obj["/Resources"]) + try: + objr = obj + while NameObject("/Resources") not in objr: + # /Resources can sometimes be inherited, so we look it up in the parents + objr = objr["/Parent"].get_object() + # if there are no more parents, no /Resources is available => an exception will be raised + resources_dict = cast(DictionaryObject, objr["/Resources"]) + except Exception: + return "" # no resources means no text is possible (no font); we consider the file as not damaged, so there is no need to check for TJ or Tj if "/Font" in resources_dict: for f in cast(DictionaryObject, resources_dict["/Font"]): cmaps[f] = build_char_map(f, space_width, obj) diff --git a/tests/test_page.py b/tests/test_page.py index e9f3ea721..40906bd3e 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -238,6 +238,13 @@ def test_extract_text_single_quote_op(): page.extract_text() +def test_no_ressources_on_text_extract(): + url = "https://github.com/py-pdf/PyPDF2/files/9428434/TelemetryTX_EM.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name="tika-964029.pdf"))) + for page in reader.pages: + page.extract_text() + + def test_iss_1142(): # check fix for problem of context save/restore (q/Q) url = "https://github.com/py-pdf/PyPDF2/files/9150656/ST.2019.PDF" @@ -285,7 +292,7 @@ def test_extract_text_page_pdf_impossible_decode_xform(caplog): for page in reader.pages: page.extract_text() warn_msgs = normalize_warnings(caplog.text) - assert warn_msgs == [" impossible to decode XFormObject /Meta203"] + assert warn_msgs == [""] # text extraction recognises no text def test_extract_text_operator_t_star(): # L1266, L1267