Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add page label support to PdfWriter #1558

Merged
merged 23 commits into from
Jan 19, 2023
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
b110225
ENH: Add page label support to PdfWriter
lorenzomanini Jan 15, 2023
c5fa998
Merge branch 'main' of https://github.com/py-pdf/pypdf
lorenzomanini Jan 15, 2023
39c6423
fix import, fix overlapping ranges behaviour
lorenzomanini Jan 17, 2023
4a48c29
add tests
lorenzomanini Jan 17, 2023
474b211
Merge remote-tracking branch 'upstream/main'
lorenzomanini Jan 17, 2023
9c97450
Merge branch 'main' of https://github.com/lorenzomanini/pypdf
lorenzomanini Jan 17, 2023
55ed6c8
Apply suggestions from code review: fix mypy
lorenzomanini Jan 17, 2023
dbd6bce
Apply suggestions from code review: fix mypy
lorenzomanini Jan 17, 2023
717c882
Fix mypy
lorenzomanini Jan 17, 2023
e298494
fix flake
lorenzomanini Jan 17, 2023
6977af6
Moved nums functions to _page_labels.py
lorenzomanini Jan 18, 2023
251390f
Added tests
lorenzomanini Jan 18, 2023
d56a63a
Added Docs
lorenzomanini Jan 18, 2023
64beecb
add PageLabelStyle constants
lorenzomanini Jan 18, 2023
739d117
More explicit PageLabelStyle constants
lorenzomanini Jan 18, 2023
b24b2ce
Apply suggestions from code review: fix Doc
lorenzomanini Jan 18, 2023
75fa103
fix flake8
lorenzomanini Jan 18, 2023
8c0a82e
add lorenzomanini to CONTRIBUTORS.md
lorenzomanini Jan 18, 2023
679b8f8
Merge branch 'main' into main
lorenzomanini Jan 18, 2023
cd0b81e
change set_page_label page indexing to 0 based
lorenzomanini Jan 18, 2023
c1c1357
Apply suggestions from code review: fix Doc
lorenzomanini Jan 19, 2023
ef2a6e8
Merge remote-tracking branch 'upstream/main'
lorenzomanini Jan 19, 2023
daad866
fix test
lorenzomanini Jan 19, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 86 additions & 1 deletion pypdf/_page_labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,21 @@
aa to zz for the next 26, and so on)
"""

from typing import Iterator
from typing import (
Iterator,
Optional,
Tuple,
)

from ._protocols import PdfReaderProtocol
from ._utils import logger_warning

from .generic import (
ArrayObject,
DictionaryObject,
NumberObject,
)


def number2uppercase_roman_numeral(num: int) -> str:
roman = [
Expand Down Expand Up @@ -173,3 +183,78 @@ def index2label(reader: PdfReaderProtocol, index: int) -> str:
)
# TODO: Implement /Kids and /Limits for number tree
return str(index + 1) # Fallback


def nums_insert(
    key: NumberObject,
    value: DictionaryObject,
    nums: ArrayObject,
) -> None:
    """
    Insert a key, value pair in a Nums array.

    See 7.9.7 "Number Trees".
    A Nums array is a flat list of the form ``[key0, value0, key1, value1, ...]``.
    The new pair is placed in front of the first key (scanning from the end)
    that is not smaller than ``key``, so an array that is sorted by key stays
    sorted. If ``key`` is already present at that position, only its value is
    replaced.

    Args:
        key: number key of the entry
        value: value of the entry
        nums: Nums array to modify (changed in place)

    Raises:
        ValueError: if ``nums`` has an odd number of elements
    """
    if len(nums) % 2 != 0:
        raise ValueError("a nums like array must have an even number of elements")

    # Walk backwards over the key slots (even positions) until the key in
    # front of position i is smaller than the one being inserted.
    i = len(nums)
    while i != 0 and key <= nums[i - 2]:
        i -= 2

    if i < len(nums) and key == nums[i]:
        # Key already present: overwrite its value instead of duplicating it.
        nums[i + 1] = value
    else:
        nums[i:i] = [key, value]


def nums_clear_range(
    key: NumberObject,
    page_index_to: int,
    nums: ArrayObject,
) -> None:
    """
    Remove all entries of a Nums array in a range after an entry.

    See 7.9.7 "Number Trees".
    Starting right after the entry whose key is ``key``, consecutive entries
    are removed as long as their key is less than or equal to
    ``page_index_to``. The entry ``key`` itself is kept.

    Args:
        key: number key of the entry before the range
        page_index_to: The page index of the upper limit of the range
        nums: Nums array to modify (changed in place)

    Raises:
        ValueError: if ``nums`` has an odd number of elements, if
            ``page_index_to`` is smaller than ``key``, or if ``key`` is not
            a key of ``nums``
    """
    if len(nums) % 2 != 0:
        raise ValueError("a nums like array must have an even number of elements")
    if page_index_to < key:
        raise ValueError("page_index_to must be greater or equal than key")

    # Only even positions hold keys. Searching the whole array (list.index)
    # could stop on a *value* that happens to compare equal to the key.
    for key_pos in range(0, len(nums), 2):
        if nums[key_pos] == key:
            break
    else:
        raise ValueError(f"{key} is not a key of the nums array")

    i = key_pos + 2
    while i < len(nums) and nums[i] <= page_index_to:
        # Drop the key and its value; the next entry slides into position i.
        del nums[i : i + 2]


def nums_next(
    key: NumberObject,
    nums: ArrayObject,
) -> Tuple[Optional[NumberObject], Optional[DictionaryObject]]:
    """
    Return the key, value pair of the entry after the one given.

    See 7.9.7 "Number Trees".

    Args:
        key: number key of the entry
        nums: Nums array

    Returns:
        The ``(key, value)`` pair stored right after the entry ``key``, or
        ``(None, None)`` if ``key`` is the last entry.

    Raises:
        ValueError: if ``nums`` has an odd number of elements or if ``key``
            is not a key of ``nums``
    """
    if len(nums) % 2 != 0:
        raise ValueError("a nums like array must have an even number of elements")

    # Only even positions hold keys. Searching the whole array (list.index)
    # could stop on a *value* that happens to compare equal to the key.
    for key_pos in range(0, len(nums), 2):
        if nums[key_pos] == key:
            break
    else:
        raise ValueError(f"{key} is not a key of the nums array")

    i = key_pos + 2
    if i < len(nums):
        return (nums[i], nums[i + 1])
    return (None, None)
106 changes: 106 additions & 0 deletions pypdf/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@

from ._encryption import Encryption
from ._page import PageObject, _VirtualList
from ._page_labels import nums_insert, nums_clear_range, nums_next
from ._reader import PdfReader
from ._security import _alg33, _alg34, _alg35
from ._utils import (
Expand Down Expand Up @@ -88,6 +89,7 @@
from .constants import StreamAttributes as SA
from .constants import TrailerKeys as TK
from .constants import TypFitArguments, UserAccessPermissions
from .constants import PageLabelStyle
from .generic import (
PAGE_FIT,
AnnotationBuilder,
Expand Down Expand Up @@ -123,6 +125,7 @@
ZoomArgType,
)


logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -2874,6 +2877,109 @@ def reset_translation(
else:
raise Exception("invalid parameter {reader}")

def set_page_label(
lorenzomanini marked this conversation as resolved.
Show resolved Hide resolved
self,
page_number_from: int,
page_number_to: int,
Copy link
Contributor Author

@lorenzomanini lorenzomanini Jan 18, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you think about the page range indexes?
Right now set_page_label requires page numbers starting from 1, while _set_page_label requires them starting from 0 (the parameter names change accordingly from page_number to page_index).
Also, in both cases the extremes are included.
I did it this way in the public interface because I think it is more natural for the user, who is probably setting these while looking at a PDF (where page numbers start from 1), but I understand if you don't agree, especially considering that the private interface is 0-based.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd prefer to use page_index everywhere and hence starting from 0.
Don't forget that pypdf users are Python developers. Starting a list of pages with index 0 is a lot more natural than starting with 1.

My suggestion would be to rename the parameters to page_index_from and page_index_to and letting it start with 0.

@pubpub-zz / @MasterOdin What's your opinion?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree with your approach

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hehehe, 1-based indexing is never the answer. The more I think about it, the more I agree with you. I'll wait a bit for MasterOdin's answer and then I'll change that. Unfortunately, I will have to change all the indexes in the tests, but I knew what I was risking when I did it :sweat_smile:

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would agree with making it 0-based for the simple reason that all other functions that refer to pages use a 0-based index.

I agree with @MartinThoma that using page_index is the more "proper" way to refer to this as people may conflate page_number with a 1-based index scheme, whereas page_index, in my opinion, really only refers to 0-based index schemes in the context of Python.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you! I'll do that

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The last commit changes everything to 0-based indexing. Are you convinced by including both extremes or should we do lower included and upper excluded?

style: Optional[PageLabelStyle] = None,
prefix: Optional[str] = None,
start: Optional[int] = 0,
MartinThoma marked this conversation as resolved.
Show resolved Hide resolved
) -> None:
"""
Set a page label to a range of pages.
Page numbers must be given starting from 1.
lorenzomanini marked this conversation as resolved.
Show resolved Hide resolved
Labels must have a style, a prefix or both.
If a range is not assigned any page label, a decimal label starting from 1 is applied.

Args:
page_number_from: page number of the beginning of the range starting from 1
page_number_to: page number of the end of the range (inclusive), starting from 1
lorenzomanini marked this conversation as resolved.
Show resolved Hide resolved
style: The numbering style to be used for the numeric portion of each page label:
D Decimal arabic numerals
R Uppercase roman numerals
r Lowercase roman numerals
A Uppercase letters (A to Z for the first 26 pages, AA to ZZ for the next 26, and so on)
a Lowercase letters (a to z for the first 26 pages, aa to zz for the next 26, and so on)
lorenzomanini marked this conversation as resolved.
Show resolved Hide resolved
prefix: The label prefix for page labels in this range.
start: The value of the numeric portion for the first page label in the range.
Subsequent pages are numbered sequentially from this value, which must be greater than or equal to 1. The default of 0 leaves the start value unset, so numbering begins at 1.
"""
if style is None and prefix is None:
raise ValueError("at least one between style and prefix must be given")
if page_number_from < 1:
raise ValueError("page_index_from must be equal or greater then 1")
if page_number_to < page_number_from:
raise ValueError(
"page_index_to must be equal or greater then page_index_from"
)
if page_number_to > len(self.pages):
raise ValueError("page_index_to exceeds number of pages")
if start is not None and start != 0 and start < 1:
raise ValueError("if given start must be equal or greater than one")

self._set_page_label(
page_number_from - 1, page_number_to - 1, style, prefix, start
)

def _set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0; both ends of the range are
    included.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label, a decimal label starting from
    1 is applied.

    This method performs no argument validation itself; ``set_page_label``
    checks the arguments before delegating here.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range (inclusive) starting from 0
        style: The numbering style to be used for the numeric portion of each page label:
            /D Decimal arabic numerals
            /R Uppercase roman numerals
            /r Lowercase roman numerals
            /A Uppercase letters (A to Z for the first 26 pages, AA to ZZ for the next 26, and so on)
            /a Lowercase letters (a to z for the first 26 pages, aa to zz for the next 26, and so on)
        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label in the range.
            Subsequent pages are numbered sequentially from this value, which must be
            greater than or equal to 1. ``0`` (the default) or ``None`` leaves the
            ``/St`` entry unset, so numbering begins at 1.
    """
    default_page_label = DictionaryObject()
    default_page_label[NameObject("/S")] = NameObject("/D")

    new_page_label = DictionaryObject()
    if style is not None:
        new_page_label[NameObject("/S")] = NameObject(style)
    if prefix is not None:
        new_page_label[NameObject("/P")] = TextStringObject(prefix)
    # start is Optional: None must be treated like 0 ("no explicit start"),
    # otherwise NumberObject(None) would raise.
    if start is not None and start != 0:
        new_page_label[NameObject("/St")] = NumberObject(start)

    if NameObject(CatalogDictionary.PAGE_LABELS) not in self._root_object:
        # No page labels yet: create the number tree with a default decimal
        # label that starts at the first page.
        nums = ArrayObject()
        nums_insert(NumberObject(0), default_page_label, nums)
        page_labels = TreeObject()
        page_labels[NameObject("/Nums")] = nums
        self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels

    page_labels = cast(
        TreeObject, self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)]
    )
    nums = cast(ArrayObject, page_labels[NameObject("/Nums")])

    # Start the new label at the beginning of the range and drop every label
    # that used to start inside the range.
    nums_insert(NumberObject(page_index_from), new_page_label, nums)
    nums_clear_range(NumberObject(page_index_from), page_index_to, nums)
    # If no label starts on the page right after the range (and that page
    # exists), insert the default label there so the new label ends with the range.
    next_label_pos, *_ = nums_next(NumberObject(page_index_from), nums)
    if next_label_pos != page_index_to + 1 and page_index_to + 1 < len(self.pages):
        nums_insert(NumberObject(page_index_to + 1), default_page_label, nums)

    page_labels[NameObject("/Nums")] = nums
    self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels


def _pdf_objectify(obj: Union[Dict[str, Any], str, int, List[Any]]) -> PdfObject:
if isinstance(obj, PdfObject):
Expand Down
10 changes: 10 additions & 0 deletions pypdf/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,16 @@ class OutlineFontFlag(IntFlag):
bold = 2


class PageLabelStyle:
    """
    Numbering styles for the numeric portion of a page label.

    Each constant is the name written to the ``/S`` entry of a page label
    dictionary. Table 8.10 in the 1.7 reference.
    """

    D = "/D"  # Decimal arabic numerals
    LOWER_R = "/r"  # Lowercase roman numerals
    UPPER_R = "/R"  # Uppercase roman numerals
    LOWER_A = "/a"  # Lowercase letters
    UPPER_A = "/A"  # Uppercase letters


PDF_KEYS = (
AnnotationDictionaryAttributes,
CatalogAttributes,
Expand Down
141 changes: 141 additions & 0 deletions tests/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1008,3 +1008,144 @@ def test_append_multiple():
pages = writer._root_object["/Pages"]["/Kids"]
assert pages[0] not in pages[1:] # page not repeated
assert pages[-1] not in pages[0:-1] # page not repeated


@pytest.mark.samples
def test_set_page_labels():
    """
    Check PdfWriter.set_page_label by reading the labels back with PdfReader.

    The page arguments passed to set_page_label in this test are 1-based and
    both ends of each range are included.
    """
    src = RESOURCE_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf"  # File without labels
    target = "pypdf-output.pdf"
    reader = PdfReader(src)

    expected = [
        "i",
        "ii",
        "1",
        "2",
        "A",
        "B",
        "1",
        "2",
        "3",
        "4",
        "A",
        "i",
        "I",
        "II",
        "1",
        "2",
        "3",
        "I",
        "II",
    ]

    # Tests full length with labels assigned at first and last elements
    # Tests different labels assigned to consecutive ranges
    writer = PdfWriter()
    writer.clone_document_from_reader(reader)
    writer.set_page_label(1, 2, "/r")
    writer.set_page_label(5, 6, "/A")
    writer.set_page_label(11, 11, "/A")
    writer.set_page_label(12, 12, "/r")
    writer.set_page_label(13, 14, "/R")
    writer.set_page_label(18, 19, "/R")
    writer.write(target)
    assert PdfReader(target).page_labels == expected

    writer = PdfWriter()  # Same labels, different set order
    writer.clone_document_from_reader(reader)
    writer.set_page_label(18, 19, "/R")
    writer.set_page_label(5, 6, "/A")
    writer.set_page_label(11, 11, "/A")
    writer.set_page_label(1, 2, "/r")
    writer.set_page_label(13, 14, "/R")
    writer.set_page_label(12, 12, "/r")
    writer.write(target)
    assert PdfReader(target).page_labels == expected

    # Tests labels assigned only in the middle
    # Tests label assigned to a range already containing labeled ranges
    expected = ["1", "2", "i", "ii", "iii", "iv", "v", "1"]
    writer = PdfWriter()
    writer.clone_document_from_reader(reader)
    writer.set_page_label(4, 5, "/a")
    writer.set_page_label(6, 6, "/A")
    writer.set_page_label(3, 7, "/r")
    writer.write(target)
    assert PdfReader(target).page_labels[: len(expected)] == expected

    # Tests labels assigned inside a previously existing range
    expected = ["1", "2", "i", "a", "b", "A", "1", "1", "2"]
    # Ones repeat because the user didn't cover the entire original range
    writer = PdfWriter()
    writer.clone_document_from_reader(reader)
    writer.set_page_label(3, 7, "/r")
    writer.set_page_label(4, 5, "/a")
    writer.set_page_label(6, 6, "/A")
    writer.write(target)
    assert PdfReader(target).page_labels[: len(expected)] == expected

    # Tests invalid user input
    writer = PdfWriter()
    writer.clone_document_from_reader(reader)
    with pytest.raises(
        ValueError, match="at least one between style and prefix must be given"
    ):
        writer.set_page_label(1, 6, start=2)
    with pytest.raises(
        ValueError, match="page_index_from must be equal or greater then 1"
    ):
        writer.set_page_label(-1, 6, "/r")
    with pytest.raises(
        ValueError, match="page_index_to must be equal or greater then page_index_from"
    ):
        writer.set_page_label(6, 1, "/r")
    with pytest.raises(ValueError, match="page_index_to exceeds number of pages"):
        writer.set_page_label(1, 20, "/r")
    with pytest.raises(
        ValueError, match="if given start must be equal or greater than one"
    ):
        writer.set_page_label(1, 6, "/r", start=-1)

    os.remove(target)

    src = (
        SAMPLE_ROOT / "009-pdflatex-geotopo/GeoTopo.pdf"
    )  # File with pre-existing labels
    target = "pypdf-output.pdf"
    reader = PdfReader(src)

    # Tests adding labels to existing ones
    expected = ["i", "ii", "A", "B", "1"]
    writer = PdfWriter()
    writer.clone_document_from_reader(reader)
    writer.set_page_label(3, 4, "/A")
    writer.write(target)
    assert PdfReader(target).page_labels[: len(expected)] == expected

    # Tests replacing existing labels
    expected = ["A", "B", "1", "1", "2"]
    writer = PdfWriter()
    writer.clone_document_from_reader(reader)
    writer.set_page_label(1, 2, "/A")
    writer.write(target)
    assert PdfReader(target).page_labels[: len(expected)] == expected

    os.remove(target)

    # Tests prefix and start.
    # NOTE(review): this section only checks that setting the labels and
    # writing the file succeed; the resulting labels are not asserted.
    src = RESOURCE_ROOT / "issue-604.pdf"  # File without page labels
    target = "page_labels_test.pdf"
    reader = PdfReader(src)
    writer = PdfWriter()
    writer.clone_document_from_reader(reader)

    writer.set_page_label(1, 1, prefix="FRONT")
    writer.set_page_label(2, 3, "/D", start=2)
    writer.set_page_label(4, 7, prefix="UPDATES")
    writer.set_page_label(8, 11, "/D", prefix="THYR-")
    writer.set_page_label(12, 22, "/D", prefix="PAP-")
    writer.set_page_label(23, 31, "/D", prefix="FOLL-")
    writer.set_page_label(32, 40, "/D", prefix="HURT-")
    writer.write(target)

    os.remove(target)  # comment out this line to inspect the written file