Skip to content

Commit

Permalink
MAINT: Add root_object, _info and _ID to PdfReader (#2495)
Browse files Browse the repository at this point in the history
Use common functionality between PdfReader and PdfWriter.
  • Loading branch information
pubpub-zz committed Mar 3, 2024
1 parent 6cf47c5 commit fb1f5df
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 63 deletions.
14 changes: 7 additions & 7 deletions pypdf/_page_labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@
Example 1
---------
>>> reader.trailer["/Root"]["/PageLabels"]["/Nums"]
>>> reader.root_object["/PageLabels"]["/Nums"]
[0, IndirectObject(18, 0, 139929798197504),
8, IndirectObject(19, 0, 139929798197504)]
>>> reader.get_object(reader.trailer["/Root"]["/PageLabels"]["/Nums"][1])
>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][1])
{'/S': '/r'}
>>> reader.get_object(reader.trailer["/Root"]["/PageLabels"]["/Nums"][3])
>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][3])
{'/S': '/D'}
Example 2
Expand Down Expand Up @@ -57,7 +57,7 @@
aa to zz for the next 26, and so on)
"""

from typing import Iterator, Optional, Tuple
from typing import Iterator, Optional, Tuple, cast

from ._protocols import PdfReaderProtocol
from ._utils import logger_warning
Expand Down Expand Up @@ -127,10 +127,10 @@ def index2label(reader: PdfReaderProtocol, index: int) -> str:
Returns:
The label of the page, e.g. "iv" or "4".
"""
root = reader.trailer["/Root"]
root = cast(DictionaryObject, reader.root_object)
if "/PageLabels" not in root:
return str(index + 1) # Fallback
number_tree = root["/PageLabels"]
number_tree = cast(DictionaryObject, root["/PageLabels"].get_object())
if "/Nums" in number_tree:
# [Nums] shall be an array of the form
# [ key 1 value 1 key 2 value 2 ... key n value n ]
Expand All @@ -139,7 +139,7 @@ def index2label(reader: PdfReaderProtocol, index: int) -> str:
# The keys shall be sorted in numerical order,
# analogously to the arrangement of keys in a name tree
# as described in 7.9.6, "Name Trees."
nums = number_tree["/Nums"]
nums = cast(ArrayObject, number_tree["/Nums"])
i = 0
value = None
start_index = 0
Expand Down
8 changes: 8 additions & 0 deletions pypdf/_protocols.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ def pages(self) -> List[Any]:
def trailer(self) -> Dict[str, Any]:
...

# Read-only accessor that reader implementations must provide; the concrete
# PdfReader resolves the trailer's "/Root" entry to fulfil it.
@property
def root_object(self) -> PdfObjectProtocol:
...

def get_object(self, indirect_reference: Any) -> Optional[PdfObjectProtocol]:
...

Expand All @@ -67,6 +71,10 @@ class PdfWriterProtocol(Protocol): # deprecated
_objects: List[Any]
_id_translated: Dict[int, Dict[int, int]]

# Same accessor name as on the reader protocol, so code can use either kind
# of object; the concrete PdfWriter returns its internally held root object.
@property
def root_object(self) -> PdfObjectProtocol:
...

def get_object(self, indirect_reference: Any) -> Optional[PdfObjectProtocol]:
...

Expand Down
71 changes: 47 additions & 24 deletions pypdf/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,9 +282,7 @@ class PdfReader:
@property
def viewer_preferences(self) -> Optional[ViewerPreferences]:
"""Returns the existing ViewerPreferences as an overloaded dictionary."""
o = cast(DictionaryObject, self.trailer["/Root"]).get(
CD.VIEWER_PREFERENCES, None
)
o = self.root_object.get(CD.VIEWER_PREFERENCES, None)
if o is None:
return None
o = o.get_object()
Expand Down Expand Up @@ -344,6 +342,33 @@ def __init__(
elif password is not None:
raise PdfReadError("Not encrypted file")

@property
def root_object(self) -> DictionaryObject:
    """
    Return the object referenced by the trailer's "/Root" entry.

    The entry is resolved with ``get_object()`` before being returned.
    The property name is standardized with PdfWriter.
    """
    root_entry = self.trailer[TK.ROOT]
    resolved = root_entry.get_object()
    return cast(DictionaryObject, resolved)

@property
def _info(self) -> Optional[DictionaryObject]:
    """
    Provide access to the trailer's "/Info" entry; standardized with PdfWriter.

    Returns:
        The resolved /Info dictionary, or None if the entry does not exist.
    """
    entry = self.trailer.get(TK.INFO, None)
    if entry is None:
        return None
    # The trailer value is resolved before it is handed to the caller.
    return cast(DictionaryObject, entry.get_object())

@property
def _ID(self) -> Optional[ArrayObject]:
    """
    Provide access to the trailer's "/ID" entry; standardized with PdfWriter.

    Returns:
        The resolved /ID array, or None if the entry does not exist.
    """
    # Named ``id_entry`` rather than ``id`` so the builtin ``id()`` is not
    # shadowed inside this method.
    id_entry = self.trailer.get(TK.ID, None)
    return None if id_entry is None else cast(ArrayObject, id_entry.get_object())

def _repr_mimebundle_(
self,
include: Union[None, Iterable[str]] = None,
Expand Down Expand Up @@ -400,21 +425,20 @@ def metadata(self) -> Optional[DocumentInformation]:
"""
if TK.INFO not in self.trailer:
return None
obj = self.trailer[TK.INFO]
retval = DocumentInformation()
if isinstance(obj, type(None)):
if isinstance(self._info, type(None)):
raise PdfReadError(
"trailer not found or does not point to document information directory"
)
retval.update(obj) # type: ignore
retval.update(self._info) # type: ignore
return retval

@property
def xmp_metadata(self) -> Optional[XmpInformation]:
"""XMP (Extensible Metadata Platform) data."""
try:
self._override_encryption = True
return self.trailer[TK.ROOT].xmp_metadata # type: ignore
return self.root_object.xmp_metadata # type: ignore
finally:
self._override_encryption = False

Expand All @@ -433,7 +457,7 @@ def _get_num_pages(self) -> int:
# the PDF file's page count is used in this case. Otherwise,
# the original method (flattened page count) is used.
if self.is_encrypted:
return self.trailer[TK.ROOT]["/Pages"]["/Count"] # type: ignore
return self.root_object["/Pages"]["/Count"] # type: ignore
else:
if self.flattened_pages is None:
self._flatten()
Expand Down Expand Up @@ -493,7 +517,7 @@ def get_fields(
field_attributes.update(CheckboxRadioButtonAttributes.attributes_dict())
if retval is None:
retval = {}
catalog = cast(DictionaryObject, self.trailer[TK.ROOT])
catalog = self.root_object
# get the AcroForm tree
if CD.ACRO_FORM in catalog:
tree = cast(Optional[TreeObject], catalog[CD.ACRO_FORM])
Expand Down Expand Up @@ -755,7 +779,7 @@ def _get_named_destinations(
"""
if retval is None:
retval = {}
catalog = cast(DictionaryObject, self.trailer[TK.ROOT])
catalog = self.root_object

# get the name tree
if CA.DESTS in catalog:
Expand Down Expand Up @@ -822,7 +846,7 @@ def _get_outline(
) -> OutlineType:
if outline is None:
outline = []
catalog = cast(DictionaryObject, self.trailer[TK.ROOT])
catalog = self.root_object

# get the outline dictionary and named destinations
if CO.OUTLINES in catalog:
Expand Down Expand Up @@ -868,7 +892,7 @@ def threads(self) -> Optional[ArrayObject]:
It's an array of dictionaries with "/F" and "/I" properties or
None if there are no articles.
"""
catalog = cast(DictionaryObject, self.trailer[TK.ROOT])
catalog = self.root_object
if CO.THREADS in catalog:
return cast("ArrayObject", catalog[CO.THREADS])
else:
Expand Down Expand Up @@ -1071,9 +1095,8 @@ def page_layout(self) -> Optional[str]:
* - /TwoPageRight
- Show two pages at a time, odd-numbered pages on the right
"""
trailer = cast(DictionaryObject, self.trailer[TK.ROOT])
if CD.PAGE_LAYOUT in trailer:
return cast(NameObject, trailer[CD.PAGE_LAYOUT])
if CD.PAGE_LAYOUT in self.root_object:
return cast(NameObject, self.root_object[CD.PAGE_LAYOUT])
return None

@property
Expand All @@ -1098,7 +1121,7 @@ def page_mode(self) -> Optional[PagemodeType]:
- Show attachments panel
"""
try:
return self.trailer[TK.ROOT]["/PageMode"] # type: ignore
return self.root_object["/PageMode"] # type: ignore
except KeyError:
return None

Expand All @@ -1119,12 +1142,12 @@ def _flatten(
if pages is None:
# Fix issue 327: set flattened_pages attribute only for
# decrypted file
catalog = self.trailer[TK.ROOT].get_object()
pages = catalog["/Pages"].get_object() # type: ignore
catalog = self.root_object
pages = cast(DictionaryObject, catalog["/Pages"].get_object())
self.flattened_pages = []

if PA.TYPE in pages:
t = pages[PA.TYPE]
t = cast(str, pages[PA.TYPE])
# if pdf has no type, considered as a page if /Kids is missing
elif PA.KIDS not in pages:
t = "/Page"
Expand Down Expand Up @@ -1925,7 +1948,7 @@ def is_encrypted(self) -> bool:
def xfa(self) -> Optional[Dict[str, Any]]:
tree: Optional[TreeObject] = None
retval: Dict[str, Any] = {}
catalog = cast(DictionaryObject, self.trailer[TK.ROOT])
catalog = self.root_object

if "/AcroForm" not in catalog or not catalog["/AcroForm"]:
return None
Expand Down Expand Up @@ -1955,7 +1978,7 @@ def add_form_topname(self, name: str) -> Optional[DictionaryObject]:
Returns:
The created object. ``None`` means no object was created.
"""
catalog = cast(DictionaryObject, self.trailer[TK.ROOT])
catalog = self.root_object

if "/AcroForm" not in catalog or not isinstance(
catalog["/AcroForm"], DictionaryObject
Expand Down Expand Up @@ -1997,7 +2020,7 @@ def rename_form_topname(self, name: str) -> Optional[DictionaryObject]:
Returns:
The modified object. ``None`` means no object was modified.
"""
catalog = cast(DictionaryObject, self.trailer[TK.ROOT])
catalog = self.root_object

if "/AcroForm" not in catalog or not isinstance(
catalog["/AcroForm"], DictionaryObject
Expand Down Expand Up @@ -2030,7 +2053,7 @@ def _list_attachments(self) -> List[str]:
Returns:
list of filenames
"""
catalog = cast(DictionaryObject, self.trailer["/Root"])
catalog = self.root_object
# From the catalog get the embedded file names
try:
filenames = cast(
Expand Down Expand Up @@ -2068,7 +2091,7 @@ def _get_attachments(
dictionary of filename -> Union[bytestring or List[ByteString]]
if the filename exists multiple times a List of the different version will be provided
"""
catalog = cast(DictionaryObject, self.trailer["/Root"])
catalog = self.root_object
# From the catalog get the embedded file names
try:
filenames = cast(
Expand Down
24 changes: 17 additions & 7 deletions pypdf/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,16 @@ def __init__(
self._encrypt_entry: Optional[DictionaryObject] = None
self._ID: Union[ArrayObject, None] = None

@property
def root_object(self) -> DictionaryObject:
    """
    Give direct access to the PDF structure held by this writer.

    Note:
        It is recommended to use this for read access only.
    """
    root = self._root_object
    return root

def __enter__(self) -> "PdfWriter":
"""Store that writer is initialized by 'with'."""
self.with_as_usage = True
Expand Down Expand Up @@ -1084,7 +1094,7 @@ def clone_reader_document_root(self, reader: PdfReader) -> None:
reader: PdfReader from the document root should be copied.
"""
self._objects.clear()
self._root_object = cast(DictionaryObject, reader.trailer[TK.ROOT].clone(self))
self._root_object = reader.root_object.clone(self)
self._root = self._root_object.indirect_reference # type: ignore[assignment]
self._pages = self._root_object.raw_get("/Pages")
self._flatten()
Expand Down Expand Up @@ -1165,10 +1175,10 @@ def clone_document_from_reader(
"""
self.clone_reader_document_root(reader)
if TK.INFO in reader.trailer:
self._info = reader.trailer[TK.INFO].clone(self).indirect_reference # type: ignore
self._info = reader._info.clone(self).indirect_reference # type: ignore
try:
self._ID = cast(ArrayObject, reader.trailer[TK.ID].clone(self))
except KeyError:
self._ID = cast(ArrayObject, reader._ID).clone(self)
except AttributeError:
pass
if callable(after_page_append):
for page in cast(
Expand Down Expand Up @@ -2546,7 +2556,7 @@ def merge(
else:
outline_item_typ = self.get_outline_root()

_ro = cast("DictionaryObject", reader.trailer[TK.ROOT])
_ro = reader.root_object
if import_outline and CO.OUTLINES in _ro:
outline = self._get_filtered_outline(
_ro.get(CO.OUTLINES, None), srcpages, reader
Expand All @@ -2569,7 +2579,7 @@ def merge(
self._root_object[NameObject("/AcroForm")] = self._add_object(
cast(
DictionaryObject,
cast(DictionaryObject, reader.trailer["/Root"])["/AcroForm"],
reader.root_object["/AcroForm"],
).clone(self, False, ("/Fields",))
)
arr = ArrayObject()
Expand All @@ -2580,7 +2590,7 @@ def merge(
)
trslat = self._id_translated[id(reader)]
try:
for f in reader.trailer["/Root"]["/AcroForm"]["/Fields"]: # type: ignore
for f in reader.root_object["/AcroForm"]["/Fields"]: # type: ignore
try:
ind = IndirectObject(trslat[f.idnum], 0, self)
if ind not in arr:
Expand Down
Loading

0 comments on commit fb1f5df

Please sign in to comment.