From 0b01b7f7306beb5cf6d93d9f6dcfd7e00c508e01 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 18 Jan 2023 23:06:16 +0100 Subject: [PATCH 1/4] remove erroneous assertion check closes #1559 This is due to a bad interpretation of text at bottom of page 108 of pdf 1.7 reference --- pypdf/_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_reader.py b/pypdf/_reader.py index 21520ed8f..0ac387b5d 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -1904,7 +1904,7 @@ def _read_xref_subsections( last_end = 0 for start, size in self._pairs(idx_pairs): # The subsections must increase - assert start >= last_end + # assert start >= last_end last_end = start + size for num in range(start, start + size): # The first entry is the type From d39988acff000eae4baf94f36baa17ebd71ced59 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 18 Jan 2023 23:11:12 +0100 Subject: [PATCH 2/4] flake8 --- pypdf/_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_reader.py b/pypdf/_reader.py index 0ac387b5d..4d8837aa6 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -1905,7 +1905,7 @@ def _read_xref_subsections( for start, size in self._pairs(idx_pairs): # The subsections must increase # assert start >= last_end - last_end = start + size + # last_end = start + size for num in range(start, start + size): # The first entry is the type xref_type = get_entry(0) From b7f7a199910ae641a0cb1e59dfb52d69bd39ff04 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Fri, 20 Jan 2023 18:21:22 +0100 Subject: [PATCH 3/4] flake8 --- pypdf/_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_reader.py b/pypdf/_reader.py index 4d8837aa6..4ec380b88 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -1901,7 +1901,7 @@ def _read_xref_subsections( get_entry: Callable[[int], Union[int, Tuple[int, ...]]], used_before: Callable[[int, Union[int, Tuple[int, ...]]], bool], ) -> None: - last_end = 0 + # last_end = 0 for start, size in self._pairs(idx_pairs): # The subsections must increase # assert start >= last_end From 222a7820716c21f762f5a9a4a118409bfeff8c75 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Fri, 20 Jan 2023 18:42:18 +0100 Subject: [PATCH 4/4] add test --- tests/test_reader.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_reader.py b/tests/test_reader.py index 62eb7b7bd..81dee1f0e 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -1279,3 +1279,11 @@ def test_build_outline_item(caplog): def test_page_labels(src, page_labels): max_indices = 6 assert PdfReader(src).page_labels[:max_indices] == page_labels[:max_indices] + + +def test_iss1559(): + url = "https://github.com/py-pdf/pypdf/files/10441992/default.pdf" + name = "iss1559.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + for p in reader.pages: + p.extract_text()