Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MAINT: Add root_object, _info and _ID to PdfReader #2495

Merged
merged 3 commits into from
Mar 3, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
14 changes: 7 additions & 7 deletions pypdf/_page_labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@
Example 1
---------

>>> reader.trailer["/Root"]["/PageLabels"]["/Nums"]
>>> reader.root_object["/PageLabels"]["/Nums"]
[0, IndirectObject(18, 0, 139929798197504),
8, IndirectObject(19, 0, 139929798197504)]
>>> reader.get_object(reader.trailer["/Root"]["/PageLabels"]["/Nums"][1])
>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][1])
{'/S': '/r'}
>>> reader.get_object(reader.trailer["/Root"]["/PageLabels"]["/Nums"][3])
>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][3])
{'/S': '/D'}

Example 2
Expand Down Expand Up @@ -57,7 +57,7 @@
aa to zz for the next 26, and so on)
"""

from typing import Iterator, Optional, Tuple
from typing import Iterator, Optional, Tuple, cast

from ._protocols import PdfReaderProtocol
from ._utils import logger_warning
Expand Down Expand Up @@ -127,10 +127,10 @@ def index2label(reader: PdfReaderProtocol, index: int) -> str:
Returns:
The label of the page, e.g. "iv" or "4".
"""
root = reader.trailer["/Root"]
root = cast(DictionaryObject, reader.root_object)
if "/PageLabels" not in root:
return str(index + 1) # Fallback
number_tree = root["/PageLabels"]
number_tree = cast(DictionaryObject, root["/PageLabels"].get_object())
if "/Nums" in number_tree:
# [Nums] shall be an array of the form
# [ key 1 value 1 key 2 value 2 ... key n value n ]
Expand All @@ -139,7 +139,7 @@ def index2label(reader: PdfReaderProtocol, index: int) -> str:
# The keys shall be sorted in numerical order,
# analogously to the arrangement of keys in a name tree
# as described in 7.9.6, "Name Trees."
nums = number_tree["/Nums"]
nums = cast(ArrayObject, number_tree["/Nums"])
i = 0
value = None
start_index = 0
Expand Down
8 changes: 8 additions & 0 deletions pypdf/_protocols.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ def pages(self) -> List[Any]:
def trailer(self) -> Dict[str, Any]:
...

@property
def root_object(self) -> PdfObjectProtocol:
    """Root object of the document, as exposed by the reader."""
    ...

def get_object(self, indirect_reference: Any) -> Optional[PdfObjectProtocol]:
...

Expand All @@ -67,6 +71,10 @@ class PdfWriterProtocol(Protocol): # deprecated
_objects: List[Any]
_id_translated: Dict[int, Dict[int, int]]

@property
def root_object(self) -> PdfObjectProtocol:
pubpub-zz marked this conversation as resolved.
Show resolved Hide resolved
...

def get_object(self, indirect_reference: Any) -> Optional[PdfObjectProtocol]:
...

Expand Down
71 changes: 47 additions & 24 deletions pypdf/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,9 +282,7 @@ class PdfReader:
@property
def viewer_preferences(self) -> Optional[ViewerPreferences]:
"""Returns the existing ViewerPreferences as an overloaded dictionary."""
o = cast(DictionaryObject, self.trailer["/Root"]).get(
CD.VIEWER_PREFERENCES, None
)
o = self.root_object.get(CD.VIEWER_PREFERENCES, None)
if o is None:
return None
o = o.get_object()
Expand Down Expand Up @@ -344,6 +342,33 @@ def __init__(
elif password is not None:
raise PdfReadError("Not encrypted file")

@property
def root_object(self) -> DictionaryObject:
    """
    Provide access to the "/Root" entry of the trailer.

    Standardized with PdfWriter, which offers a property of the same name.

    Returns:
        The result of ``get_object()`` on the trailer's root entry.
    """
    root_entry = self.trailer[TK.ROOT]
    return cast(DictionaryObject, root_entry.get_object())

@property
def _info(self) -> Optional[DictionaryObject]:
    """
    Provide access to the "/Info" entry of the trailer.

    Standardized with PdfWriter.

    Returns:
        The /Info dictionary, or None if the trailer has no such entry.
    """
    entry = self.trailer.get(TK.INFO, None)
    if entry is None:
        return None
    return cast(DictionaryObject, entry.get_object())

@property
def _ID(self) -> Optional[ArrayObject]:
    """
    Provide access to the "/ID" entry of the trailer.

    Standardized with PdfWriter.

    Returns:
        The /ID array, or None if the trailer has no such entry.
    """
    # Named ``id_entry`` rather than ``id`` so the builtin ``id()`` is not
    # shadowed inside this property.
    id_entry = self.trailer.get(TK.ID, None)
    if id_entry is None:
        return None
    return cast(ArrayObject, id_entry.get_object())

def _repr_mimebundle_(
self,
include: Union[None, Iterable[str]] = None,
Expand Down Expand Up @@ -400,21 +425,20 @@ def metadata(self) -> Optional[DocumentInformation]:
"""
if TK.INFO not in self.trailer:
return None
obj = self.trailer[TK.INFO]
retval = DocumentInformation()
if isinstance(obj, type(None)):
if isinstance(self._info, type(None)):
raise PdfReadError(
"trailer not found or does not point to document information directory"
)
retval.update(obj) # type: ignore
retval.update(self._info) # type: ignore
return retval

@property
def xmp_metadata(self) -> Optional[XmpInformation]:
"""XMP (Extensible Metadata Platform) data."""
try:
self._override_encryption = True
return self.trailer[TK.ROOT].xmp_metadata # type: ignore
return self.root_object.xmp_metadata # type: ignore
finally:
self._override_encryption = False

Expand All @@ -433,7 +457,7 @@ def _get_num_pages(self) -> int:
# the PDF file's page count is used in this case. Otherwise,
# the original method (flattened page count) is used.
if self.is_encrypted:
return self.trailer[TK.ROOT]["/Pages"]["/Count"] # type: ignore
return self.root_object["/Pages"]["/Count"] # type: ignore
else:
if self.flattened_pages is None:
self._flatten()
Expand Down Expand Up @@ -493,7 +517,7 @@ def get_fields(
field_attributes.update(CheckboxRadioButtonAttributes.attributes_dict())
if retval is None:
retval = {}
catalog = cast(DictionaryObject, self.trailer[TK.ROOT])
catalog = self.root_object
# get the AcroForm tree
if CD.ACRO_FORM in catalog:
tree = cast(Optional[TreeObject], catalog[CD.ACRO_FORM])
Expand Down Expand Up @@ -755,7 +779,7 @@ def _get_named_destinations(
"""
if retval is None:
retval = {}
catalog = cast(DictionaryObject, self.trailer[TK.ROOT])
catalog = self.root_object

# get the name tree
if CA.DESTS in catalog:
Expand Down Expand Up @@ -822,7 +846,7 @@ def _get_outline(
) -> OutlineType:
if outline is None:
outline = []
catalog = cast(DictionaryObject, self.trailer[TK.ROOT])
catalog = self.root_object

# get the outline dictionary and named destinations
if CO.OUTLINES in catalog:
Expand Down Expand Up @@ -868,7 +892,7 @@ def threads(self) -> Optional[ArrayObject]:
It's an array of dictionaries with "/F" and "/I" properties or
None if there are no articles.
"""
catalog = cast(DictionaryObject, self.trailer[TK.ROOT])
catalog = self.root_object
if CO.THREADS in catalog:
return cast("ArrayObject", catalog[CO.THREADS])
else:
Expand Down Expand Up @@ -1071,9 +1095,8 @@ def page_layout(self) -> Optional[str]:
* - /TwoPageRight
- Show two pages at a time, odd-numbered pages on the right
"""
trailer = cast(DictionaryObject, self.trailer[TK.ROOT])
if CD.PAGE_LAYOUT in trailer:
return cast(NameObject, trailer[CD.PAGE_LAYOUT])
if CD.PAGE_LAYOUT in self.root_object:
return cast(NameObject, self.root_object[CD.PAGE_LAYOUT])
return None

@property
Expand All @@ -1098,7 +1121,7 @@ def page_mode(self) -> Optional[PagemodeType]:
- Show attachments panel
"""
try:
return self.trailer[TK.ROOT]["/PageMode"] # type: ignore
return self.root_object["/PageMode"] # type: ignore
except KeyError:
return None

Expand All @@ -1119,12 +1142,12 @@ def _flatten(
if pages is None:
# Fix issue 327: set flattened_pages attribute only for
# decrypted file
catalog = self.trailer[TK.ROOT].get_object()
pages = catalog["/Pages"].get_object() # type: ignore
catalog = self.root_object
pages = cast(DictionaryObject, catalog["/Pages"].get_object())
self.flattened_pages = []

if PA.TYPE in pages:
t = pages[PA.TYPE]
t = cast(str, pages[PA.TYPE])
# if pdf has no type, considered as a page if /Kids is missing
elif PA.KIDS not in pages:
t = "/Page"
Expand Down Expand Up @@ -1925,7 +1948,7 @@ def is_encrypted(self) -> bool:
def xfa(self) -> Optional[Dict[str, Any]]:
tree: Optional[TreeObject] = None
retval: Dict[str, Any] = {}
catalog = cast(DictionaryObject, self.trailer[TK.ROOT])
catalog = self.root_object

if "/AcroForm" not in catalog or not catalog["/AcroForm"]:
return None
Expand Down Expand Up @@ -1955,7 +1978,7 @@ def add_form_topname(self, name: str) -> Optional[DictionaryObject]:
Returns:
The created object. ``None`` means no object was created.
"""
catalog = cast(DictionaryObject, self.trailer[TK.ROOT])
catalog = self.root_object

if "/AcroForm" not in catalog or not isinstance(
catalog["/AcroForm"], DictionaryObject
Expand Down Expand Up @@ -1997,7 +2020,7 @@ def rename_form_topname(self, name: str) -> Optional[DictionaryObject]:
Returns:
The modified object. ``None`` means no object was modified.
"""
catalog = cast(DictionaryObject, self.trailer[TK.ROOT])
catalog = self.root_object

if "/AcroForm" not in catalog or not isinstance(
catalog["/AcroForm"], DictionaryObject
Expand Down Expand Up @@ -2030,7 +2053,7 @@ def _list_attachments(self) -> List[str]:
Returns:
list of filenames
"""
catalog = cast(DictionaryObject, self.trailer["/Root"])
catalog = self.root_object
# From the catalog get the embedded file names
try:
filenames = cast(
Expand Down Expand Up @@ -2068,7 +2091,7 @@ def _get_attachments(
dictionary of filename -> Union[bytestring or List[ByteString]]
if the filename exists multiple times a List of the different version will be provided
"""
catalog = cast(DictionaryObject, self.trailer["/Root"])
catalog = self.root_object
# From the catalog get the embedded file names
try:
filenames = cast(
Expand Down
24 changes: 17 additions & 7 deletions pypdf/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,16 @@ def __init__(
self._encrypt_entry: Optional[DictionaryObject] = None
self._ID: Union[ArrayObject, None] = None

@property
def root_object(self) -> DictionaryObject:
    """
    Provide direct access to the PDF structure.

    Returns the writer's ``_root_object`` attribute itself.

    Note:
        Recommended to be used only for read access.
    """
    return self._root_object

def __enter__(self) -> "PdfWriter":
"""Store that writer is initialized by 'with'."""
self.with_as_usage = True
Expand Down Expand Up @@ -1084,7 +1094,7 @@ def clone_reader_document_root(self, reader: PdfReader) -> None:
reader: PdfReader from the document root should be copied.
"""
self._objects.clear()
self._root_object = cast(DictionaryObject, reader.trailer[TK.ROOT].clone(self))
self._root_object = reader.root_object.clone(self)
self._root = self._root_object.indirect_reference # type: ignore[assignment]
self._pages = self._root_object.raw_get("/Pages")
self._flatten()
Expand Down Expand Up @@ -1165,10 +1175,10 @@ def clone_document_from_reader(
"""
self.clone_reader_document_root(reader)
if TK.INFO in reader.trailer:
self._info = reader.trailer[TK.INFO].clone(self).indirect_reference # type: ignore
self._info = reader._info.clone(self).indirect_reference # type: ignore
try:
self._ID = cast(ArrayObject, reader.trailer[TK.ID].clone(self))
except KeyError:
self._ID = cast(ArrayObject, reader._ID).clone(self)
except AttributeError:
pass
if callable(after_page_append):
for page in cast(
Expand Down Expand Up @@ -2546,7 +2556,7 @@ def merge(
else:
outline_item_typ = self.get_outline_root()

_ro = cast("DictionaryObject", reader.trailer[TK.ROOT])
_ro = reader.root_object
if import_outline and CO.OUTLINES in _ro:
outline = self._get_filtered_outline(
_ro.get(CO.OUTLINES, None), srcpages, reader
Expand All @@ -2569,7 +2579,7 @@ def merge(
self._root_object[NameObject("/AcroForm")] = self._add_object(
cast(
DictionaryObject,
cast(DictionaryObject, reader.trailer["/Root"])["/AcroForm"],
reader.root_object["/AcroForm"],
).clone(self, False, ("/Fields",))
)
arr = ArrayObject()
Expand All @@ -2580,7 +2590,7 @@ def merge(
)
trslat = self._id_translated[id(reader)]
try:
for f in reader.trailer["/Root"]["/AcroForm"]["/Fields"]: # type: ignore
for f in reader.root_object["/AcroForm"]["/Fields"]: # type: ignore
try:
ind = IndirectObject(trslat[f.idnum], 0, self)
if ind not in arr:
Expand Down