MAINT: Add root_object, _info and _ID to PdfReader (#2495)

Use common functionality between PdfReader and PdfWriter.
py-pdf · Mar 3, 2024 · fb1f5df · fb1f5df
1 parent 6cf47c5
commit fb1f5df
Show file tree

Hide file tree

Showing 5 changed files with 101 additions and 63 deletions.
diff --git a/pypdf/_page_labels.py b/pypdf/_page_labels.py
@@ -11,12 +11,12 @@
 Example 1
 ---------
 
->>> reader.trailer["/Root"]["/PageLabels"]["/Nums"]
+>>> reader.root_object["/PageLabels"]["/Nums"]
 [0, IndirectObject(18, 0, 139929798197504),
  8, IndirectObject(19, 0, 139929798197504)]
->>> reader.get_object(reader.trailer["/Root"]["/PageLabels"]["/Nums"][1])
+>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][1])
 {'/S': '/r'}
->>> reader.get_object(reader.trailer["/Root"]["/PageLabels"]["/Nums"][3])
+>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][3])
 {'/S': '/D'}
 
 Example 2
@@ -57,7 +57,7 @@
                            aa to zz for the next 26, and so on)
 """
 
-from typing import Iterator, Optional, Tuple
+from typing import Iterator, Optional, Tuple, cast
 
 from ._protocols import PdfReaderProtocol
 from ._utils import logger_warning
@@ -127,10 +127,10 @@ def index2label(reader: PdfReaderProtocol, index: int) -> str:
     Returns:
         The label of the page, e.g. "iv" or "4".
     """
-    root = reader.trailer["/Root"]
+    root = cast(DictionaryObject, reader.root_object)
     if "/PageLabels" not in root:
         return str(index + 1)  # Fallback
-    number_tree = root["/PageLabels"]
+    number_tree = cast(DictionaryObject, root["/PageLabels"].get_object())
     if "/Nums" in number_tree:
         # [Nums] shall be an array of the form
         #   [ key 1 value 1 key 2 value 2 ... key n value n ]
@@ -139,7 +139,7 @@ def index2label(reader: PdfReaderProtocol, index: int) -> str:
         # The keys shall be sorted in numerical order,
         # analogously to the arrangement of keys in a name tree
         # as described in 7.9.6, "Name Trees."
-        nums = number_tree["/Nums"]
+        nums = cast(ArrayObject, number_tree["/Nums"])
         i = 0
         value = None
         start_index = 0

diff --git a/pypdf/_protocols.py b/pypdf/_protocols.py
@@ -59,6 +59,10 @@ def pages(self) -> List[Any]:
     def trailer(self) -> Dict[str, Any]:
         ...
 
+    @property
+    def root_object(self) -> PdfObjectProtocol:
+        ...
+
     def get_object(self, indirect_reference: Any) -> Optional[PdfObjectProtocol]:
         ...
 
@@ -67,6 +71,10 @@ class PdfWriterProtocol(Protocol):  # deprecated
     _objects: List[Any]
     _id_translated: Dict[int, Dict[int, int]]
 
+    @property
+    def root_object(self) -> PdfObjectProtocol:
+        ...
+
     def get_object(self, indirect_reference: Any) -> Optional[PdfObjectProtocol]:
         ...
 

diff --git a/pypdf/_reader.py b/pypdf/_reader.py
@@ -282,9 +282,7 @@ class PdfReader:
     @property
     def viewer_preferences(self) -> Optional[ViewerPreferences]:
         """Returns the existing ViewerPreferences as an overloaded dictionary."""
-        o = cast(DictionaryObject, self.trailer["/Root"]).get(
-            CD.VIEWER_PREFERENCES, None
-        )
+        o = self.root_object.get(CD.VIEWER_PREFERENCES, None)
         if o is None:
             return None
         o = o.get_object()
@@ -344,6 +342,33 @@ def __init__(
         elif password is not None:
             raise PdfReadError("Not encrypted file")
 
+    @property
+    def root_object(self) -> DictionaryObject:
+        """Provide access to "/Root". Standardized with PdfWriter."""
+        return cast(DictionaryObject, self.trailer[TK.ROOT].get_object())
+
+    @property
+    def _info(self) -> Optional[DictionaryObject]:
+        """
+        Provide access to "/Info". Standardized with PdfWriter.
+
+        Returns:
+            /Info Dictionary; None if the entry does not exist
+        """
+        info = self.trailer.get(TK.INFO, None)
+        return None if info is None else cast(DictionaryObject, info.get_object())
+
+    @property
+    def _ID(self) -> Optional[ArrayObject]:
+        """
+        Provide access to "/ID". Standardized with PdfWriter.
+
+        Returns:
+            /ID array; None if the entry does not exist
+        """
+        id = self.trailer.get(TK.ID, None)
+        return None if id is None else cast(ArrayObject, id.get_object())
+
     def _repr_mimebundle_(
         self,
         include: Union[None, Iterable[str]] = None,
@@ -400,21 +425,20 @@ def metadata(self) -> Optional[DocumentInformation]:
         """
         if TK.INFO not in self.trailer:
             return None
-        obj = self.trailer[TK.INFO]
         retval = DocumentInformation()
-        if isinstance(obj, type(None)):
+        if isinstance(self._info, type(None)):
             raise PdfReadError(
                 "trailer not found or does not point to document information directory"
             )
-        retval.update(obj)  # type: ignore
+        retval.update(self._info)  # type: ignore
         return retval
 
     @property
     def xmp_metadata(self) -> Optional[XmpInformation]:
         """XMP (Extensible Metadata Platform) data."""
         try:
             self._override_encryption = True
-            return self.trailer[TK.ROOT].xmp_metadata  # type: ignore
+            return self.root_object.xmp_metadata  # type: ignore
         finally:
             self._override_encryption = False
 
@@ -433,7 +457,7 @@ def _get_num_pages(self) -> int:
         # the PDF file's page count is used in this case. Otherwise,
         # the original method (flattened page count) is used.
         if self.is_encrypted:
-            return self.trailer[TK.ROOT]["/Pages"]["/Count"]  # type: ignore
+            return self.root_object["/Pages"]["/Count"]  # type: ignore
         else:
             if self.flattened_pages is None:
                 self._flatten()
@@ -493,7 +517,7 @@ def get_fields(
         field_attributes.update(CheckboxRadioButtonAttributes.attributes_dict())
         if retval is None:
             retval = {}
-            catalog = cast(DictionaryObject, self.trailer[TK.ROOT])
+            catalog = self.root_object
             # get the AcroForm tree
             if CD.ACRO_FORM in catalog:
                 tree = cast(Optional[TreeObject], catalog[CD.ACRO_FORM])
@@ -755,7 +779,7 @@ def _get_named_destinations(
         """
         if retval is None:
             retval = {}
-            catalog = cast(DictionaryObject, self.trailer[TK.ROOT])
+            catalog = self.root_object
 
             # get the name tree
             if CA.DESTS in catalog:
@@ -822,7 +846,7 @@ def _get_outline(
     ) -> OutlineType:
         if outline is None:
             outline = []
-            catalog = cast(DictionaryObject, self.trailer[TK.ROOT])
+            catalog = self.root_object
 
             # get the outline dictionary and named destinations
             if CO.OUTLINES in catalog:
@@ -868,7 +892,7 @@ def threads(self) -> Optional[ArrayObject]:
         It's an array of dictionaries with "/F" and "/I" properties or
         None if there are no articles.
         """
-        catalog = cast(DictionaryObject, self.trailer[TK.ROOT])
+        catalog = self.root_object
         if CO.THREADS in catalog:
             return cast("ArrayObject", catalog[CO.THREADS])
         else:
@@ -1071,9 +1095,8 @@ def page_layout(self) -> Optional[str]:
            * - /TwoPageRight
              - Show two pages at a time, odd-numbered pages on the right
         """
-        trailer = cast(DictionaryObject, self.trailer[TK.ROOT])
-        if CD.PAGE_LAYOUT in trailer:
-            return cast(NameObject, trailer[CD.PAGE_LAYOUT])
+        if CD.PAGE_LAYOUT in self.root_object:
+            return cast(NameObject, self.root_object[CD.PAGE_LAYOUT])
         return None
 
     @property
@@ -1098,7 +1121,7 @@ def page_mode(self) -> Optional[PagemodeType]:
              - Show attachments panel
         """
         try:
-            return self.trailer[TK.ROOT]["/PageMode"]  # type: ignore
+            return self.root_object["/PageMode"]  # type: ignore
         except KeyError:
             return None
 
@@ -1119,12 +1142,12 @@ def _flatten(
         if pages is None:
             # Fix issue 327: set flattened_pages attribute only for
             # decrypted file
-            catalog = self.trailer[TK.ROOT].get_object()
-            pages = catalog["/Pages"].get_object()  # type: ignore
+            catalog = self.root_object
+            pages = cast(DictionaryObject, catalog["/Pages"].get_object())
             self.flattened_pages = []
 
         if PA.TYPE in pages:
-            t = pages[PA.TYPE]
+            t = cast(str, pages[PA.TYPE])
         # if pdf has no type, considered as a page if /Kids is missing
         elif PA.KIDS not in pages:
             t = "/Page"
@@ -1925,7 +1948,7 @@ def is_encrypted(self) -> bool:
     def xfa(self) -> Optional[Dict[str, Any]]:
         tree: Optional[TreeObject] = None
         retval: Dict[str, Any] = {}
-        catalog = cast(DictionaryObject, self.trailer[TK.ROOT])
+        catalog = self.root_object
 
         if "/AcroForm" not in catalog or not catalog["/AcroForm"]:
             return None
@@ -1955,7 +1978,7 @@ def add_form_topname(self, name: str) -> Optional[DictionaryObject]:
         Returns:
             The created object. ``None`` means no object was created.
         """
-        catalog = cast(DictionaryObject, self.trailer[TK.ROOT])
+        catalog = self.root_object
 
         if "/AcroForm" not in catalog or not isinstance(
             catalog["/AcroForm"], DictionaryObject
@@ -1997,7 +2020,7 @@ def rename_form_topname(self, name: str) -> Optional[DictionaryObject]:
         Returns:
             The modified object. ``None`` means no object was modified.
         """
-        catalog = cast(DictionaryObject, self.trailer[TK.ROOT])
+        catalog = self.root_object
 
         if "/AcroForm" not in catalog or not isinstance(
             catalog["/AcroForm"], DictionaryObject
@@ -2030,7 +2053,7 @@ def _list_attachments(self) -> List[str]:
         Returns:
             list of filenames
         """
-        catalog = cast(DictionaryObject, self.trailer["/Root"])
+        catalog = self.root_object
         # From the catalog get the embedded file names
         try:
             filenames = cast(
@@ -2068,7 +2091,7 @@ def _get_attachments(
             dictionary of filename -> Union[bytestring or List[ByteString]]
             if the filename exists multiple times a List of the different version will be provided
         """
-        catalog = cast(DictionaryObject, self.trailer["/Root"])
+        catalog = self.root_object
         # From the catalog get the embedded file names
         try:
             filenames = cast(

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
@@ -211,6 +211,16 @@ def __init__(
         self._encrypt_entry: Optional[DictionaryObject] = None
         self._ID: Union[ArrayObject, None] = None
 
+    @property
+    def root_object(self) -> DictionaryObject:
+        """
+        Provide direct access to the PDF structure.
+
+        Note:
+            Recommended to be used only for read access.
+        """
+        return self._root_object
+
     def __enter__(self) -> "PdfWriter":
         """Store that writer is initialized by 'with'."""
         self.with_as_usage = True
@@ -1084,7 +1094,7 @@ def clone_reader_document_root(self, reader: PdfReader) -> None:
             reader: PdfReader from the document root should be copied.
         """
         self._objects.clear()
-        self._root_object = cast(DictionaryObject, reader.trailer[TK.ROOT].clone(self))
+        self._root_object = reader.root_object.clone(self)
         self._root = self._root_object.indirect_reference  # type: ignore[assignment]
         self._pages = self._root_object.raw_get("/Pages")
         self._flatten()
@@ -1165,10 +1175,10 @@ def clone_document_from_reader(
         """
         self.clone_reader_document_root(reader)
         if TK.INFO in reader.trailer:
-            self._info = reader.trailer[TK.INFO].clone(self).indirect_reference  # type: ignore
+            self._info = reader._info.clone(self).indirect_reference  # type: ignore
         try:
-            self._ID = cast(ArrayObject, reader.trailer[TK.ID].clone(self))
-        except KeyError:
+            self._ID = cast(ArrayObject, reader._ID).clone(self)
+        except AttributeError:
             pass
         if callable(after_page_append):
             for page in cast(
@@ -2546,7 +2556,7 @@ def merge(
         else:
             outline_item_typ = self.get_outline_root()
 
-        _ro = cast("DictionaryObject", reader.trailer[TK.ROOT])
+        _ro = reader.root_object
         if import_outline and CO.OUTLINES in _ro:
             outline = self._get_filtered_outline(
                 _ro.get(CO.OUTLINES, None), srcpages, reader
@@ -2569,7 +2579,7 @@ def merge(
                 self._root_object[NameObject("/AcroForm")] = self._add_object(
                     cast(
                         DictionaryObject,
-                        cast(DictionaryObject, reader.trailer["/Root"])["/AcroForm"],
+                        reader.root_object["/AcroForm"],
                     ).clone(self, False, ("/Fields",))
                 )
                 arr = ArrayObject()
@@ -2580,7 +2590,7 @@ def merge(
                 )
             trslat = self._id_translated[id(reader)]
             try:
-                for f in reader.trailer["/Root"]["/AcroForm"]["/Fields"]:  # type: ignore
+                for f in reader.root_object["/AcroForm"]["/Fields"]:  # type: ignore
                     try:
                         ind = IndirectObject(trslat[f.idnum], 0, self)
                         if ind not in arr: