Skip to content

Commit

Permalink
ENH: Add support for /Kids in page labels (#2562)
Browse files Browse the repository at this point in the history
* ENH: Add support for /Kids in page labels

---------

Co-authored-by: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
  • Loading branch information
stefan6419846 and pubpub-zz committed Apr 3, 2024
1 parent 4bdca16 commit bc29901
Show file tree
Hide file tree
Showing 2 changed files with 119 additions and 44 deletions.
112 changes: 68 additions & 44 deletions pypdf/_page_labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,11 @@
aa to zz for the next 26, and so on)
"""

from typing import Iterator, Optional, Tuple, cast
from typing import Iterator, List, Optional, Tuple, cast

from ._protocols import PdfCommonDocProtocol
from ._utils import logger_warning
from .generic import ArrayObject, DictionaryObject, NumberObject
from .generic import ArrayObject, DictionaryObject, NullObject, NumberObject


def number2uppercase_roman_numeral(num: int) -> str:
Expand Down Expand Up @@ -116,6 +116,42 @@ def number2lowercase_letter(number: int) -> str:
return number2uppercase_letter(number).lower()


def get_label_from_nums(dictionary_object: DictionaryObject, index: int) -> str:
    """
    Determine the page label for a page from the ``/Nums`` array of a number tree node.

    Args:
        dictionary_object: Number tree node which holds a ``/Nums`` entry.
        index: The zero-based page index to get the label for.

    Returns:
        The label built from the matching label dictionary (``/P`` prefix plus
        the number formatted according to ``/S``, counted from ``/St``).
        Falls back to ``str(index + 1)`` if ``/Nums`` is empty, holds no
        complete key/value pair, the selected value is not a dictionary or
        the ``/S`` numbering style is not one of the supported ones.
    """
    # [Nums] shall be an array of the form
    #   [ key 1 value 1 key 2 value 2 ... key n value n ]
    # where each key_i is an integer and the corresponding
    # value_i shall be the object associated with that key.
    # The keys shall be sorted in numerical order,
    # analogously to the arrangement of keys in a name tree
    # as described in 7.9.6, "Name Trees."
    nums = cast(ArrayObject, dictionary_object["/Nums"])
    value = None
    start_index = 0
    # Only complete key/value pairs are considered. A dangling trailing key
    # (array of odd length) is ignored instead of raising an IndexError.
    pair_count = len(nums) // 2
    for pair in range(pair_count):
        i = 2 * pair
        start_index = nums[i]
        value = nums[i + 1].get_object()
        # Stop at the last range or as soon as the next range starts
        # behind the requested page.
        if pair + 1 == pair_count or nums[i + 2] > index:
            break

    # if /Nums array is not following the specification or if /Nums is empty
    if not isinstance(value, dict):
        return str(index + 1)  # Fallback

    formatters = {
        None: lambda n: "",
        "/D": lambda n: str(n),
        "/R": number2uppercase_roman_numeral,
        "/r": number2lowercase_roman_numeral,
        "/A": number2uppercase_letter,
        "/a": number2lowercase_letter,
    }
    try:
        formatter = formatters[value.get("/S")]
    except (KeyError, TypeError):
        # Unknown (KeyError) or unhashable (TypeError) numbering style:
        # treat it like any other malformed label dictionary.
        return str(index + 1)  # Fallback

    start = value.get("/St", 1)
    prefix = value.get("/P", "")
    return prefix + formatter(index - start_index + start)


def index2label(reader: PdfCommonDocProtocol, index: int) -> str:
"""
See 7.9.7 "Number Trees".
Expand All @@ -132,49 +168,37 @@ def index2label(reader: PdfCommonDocProtocol, index: int) -> str:
return str(index + 1) # Fallback
number_tree = cast(DictionaryObject, root["/PageLabels"].get_object())
if "/Nums" in number_tree:
# [Nums] shall be an array of the form
# [ key 1 value 1 key 2 value 2 ... key n value n ]
# where each key_i is an integer and the corresponding
# value_i shall be the object associated with that key.
# The keys shall be sorted in numerical order,
# analogously to the arrangement of keys in a name tree
# as described in 7.9.6, "Name Trees."
nums = cast(ArrayObject, number_tree["/Nums"])
i = 0
value = None
start_index = 0
while i < len(nums):
start_index = nums[i]
value = nums[i + 1].get_object()
if i + 2 == len(nums):
return get_label_from_nums(number_tree, index)
if "/Kids" in number_tree and not isinstance(number_tree["/Kids"], NullObject):
# number_tree = {'/Kids': [IndirectObject(7333, 0, 140132998195856), ...]}
# Limit maximum depth.
level = 0
while level < 100:
kids = cast(List[DictionaryObject], number_tree["/Kids"])
for kid in kids:
# kid = {'/Limits': [0, 63], '/Nums': [0, {'/P': 'C1'}, ...]}
limits = cast(List[int], kid["/Limits"])
if limits[0] <= index <= limits[1]:
if kid.get("/Kids", None) is not None:
# Recursive definition.
level += 1
if level == 100: # pragma: no cover
raise NotImplementedError("Too deep nesting is not supported.")
number_tree = kid
# Exit the inner `for` loop and continue at the next level with the
# next iteration of the `while` loop.
break
return get_label_from_nums(kid, index)
else:
# When there are no kids, make sure to exit the `while` loop directly
# and continue with the fallback.
break
if nums[i + 2] > index:
break
i += 2
m = {
None: lambda n: "",
"/D": lambda n: str(n),
"/R": number2uppercase_roman_numeral,
"/r": number2lowercase_roman_numeral,
"/A": number2uppercase_letter,
"/a": number2lowercase_letter,
}
# if /Nums array is not following the specification or if /Nums is empty
if not isinstance(value, dict):
return str(index + 1) # Fallback
start = value.get("/St", 1)
prefix = value.get("/P", "")
return prefix + m[value.get("/S")](index - start_index + start)
if "/Kids" in number_tree or "/Limits" in number_tree:
logger_warning(
(
"/Kids or /Limits found in PageLabels. "
"This is not yet supported."
),
__name__,
)
# TODO: Implement /Kids and /Limits for number tree
return str(index + 1) # Fallback if /Nums is not in the number_tree

logger_warning(
f"Could not reliably determine page label for {index}.",
__name__
)
return str(index + 1) # Fallback if neither /Nums nor /Kids is in the number_tree


def nums_insert(
Expand Down
51 changes: 51 additions & 0 deletions tests/test_page_labels.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
"""Test the pypdf._page_labels module."""
from io import BytesIO
from pathlib import Path

import pytest

from pypdf import PdfReader
from pypdf._page_labels import (
get_label_from_nums,
index2label,
number2lowercase_letter,
number2lowercase_roman_numeral,
Expand All @@ -15,6 +17,7 @@
nums_next,
)
from pypdf.generic import (
ArrayObject,
DictionaryObject,
NameObject,
NullObject,
Expand All @@ -23,6 +26,10 @@

from . import get_data_from_url

# Resolved directory which contains this test module.
TESTS_ROOT = Path(__file__).parent.resolve()
# Parent directory of the tests directory.
PROJECT_ROOT = TESTS_ROOT.parent
# "resources" directory below the project root; local sample PDFs are loaded from here.
RESOURCE_ROOT = PROJECT_ROOT / "resources"


@pytest.mark.parametrize(
("number", "expected"),
Expand Down Expand Up @@ -103,3 +110,47 @@ def test_index2label(caplog):
r.trailer["/Root"]["/PageLabels"][NameObject("/Kids")] = NullObject()
assert index2label(r, 1) == "2"
assert caplog.text != ""


@pytest.mark.enable_socket()
def test_index2label_kids():
    """Page labels are resolved through the /Kids entries of the number tree."""
    url = "https://www.bk.admin.ch/dam/bk/de/dokumente/terminologie/publikation_25_jahre_rtd.pdf.download.pdf/Terminologie_Epochen,%20Schwerpunkte,%20Umsetzungen.pdf"  # noqa: E501
    pdf_data = get_data_from_url(url=url, name="index2label_kids.pdf")
    reader = PdfReader(BytesIO(pdf_data))

    # Some page labels are unused. Filtering them out is still easier than
    # spelling out the whole list here.
    unused = {"20", "44", "58", "82", "94", "116", "154", "166", "192", "224", "250"}
    roman = [
        "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X",
        "XI", "XII", "XIII", "XIV", "XV", "XVI", "XVII",
    ]
    decimal = [str(number) for number in range(1, 284) if str(number) not in unused]

    assert reader.page_labels == ["C1", *roman, *decimal]


@pytest.mark.enable_socket()
def test_index2label_kids__recursive(caplog):
    """Nested /Kids are followed; the test also expects some log output."""
    url = "https://github.com/py-pdf/pypdf/files/14842446/tt1.pdf"
    pdf_data = get_data_from_url(url=url, name="index2label_kids_recursive.pdf")
    reader = PdfReader(BytesIO(pdf_data))

    # Letters "A" to "P" for the first 16 pages, plain numbers afterwards.
    letters = list("ABCDEFGHIJKLMNOP")
    numbers = ["17", "18", "19"]

    assert reader.page_labels == letters + numbers
    assert caplog.text != ""


def test_get_label_from_nums__empty_nums_list():
    """An empty /Nums array yields the one-based page number as fallback."""
    node = DictionaryObject()
    node[NameObject("/Nums")] = ArrayObject()

    label = get_label_from_nums(node, 13)

    assert label == "14"


def test_index2label__empty_kids_list():
    """An empty /Kids array yields the one-based page number as fallback."""
    reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf")

    page_labels = DictionaryObject()
    page_labels[NameObject("/Kids")] = ArrayObject()
    reader.root_object[NameObject("/PageLabels")] = page_labels

    label = index2label(reader, 42)

    assert label == "43"

0 comments on commit bc29901

Please sign in to comment.