DOC: Added that threads can now contain a metadata stream (#2711)

Also some very small changes including changing references to be from ISO 32000.
py-pdf · Jun 11, 2024 · a5a6974 · a5a6974
1 parent 26d1615
commit a5a6974
Show file tree

Hide file tree

Showing 6 changed files with 42 additions and 36 deletions.
diff --git a/docs/user/merging-pdfs.md b/docs/user/merging-pdfs.md
@@ -83,7 +83,7 @@ A grouping field should be added before adding the source PDF to prevent that.
 The original fields will be identified by adding the group name.
 
 For example, after calling `reader.add_form_topname("form1")`, the field
-previously named `field1` will now identified as `form1.field1` when calling
+previously named `field1` is now identified as `form1.field1` when calling
 `reader.get_form_text_fields(True)` or `reader.get_fields()`.
 
 After that, you can append the input PDF completely or partially using

diff --git a/docs/user/viewer-preferences.md b/docs/user/viewer-preferences.md
@@ -1,11 +1,11 @@
 # Adding Viewer Preferences
 
 It is possible to set viewer preferences of a PDF file.
-These properties are described in Section 12.2 of the [PDF 1.7 specification](https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf).
+These properties are described in §12.2 of the [PDF 1.7 specification](https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf).
 
 Note that the `/ViewerPreferences` dictionary does not exist by default.
 If it is not already present, it must be created by calling the `create_viewer_preferences` method
-of the `PdfWriter` object.
+of a `PdfWriter` object.
 
 If viewer preferences exist in a PDF file being read with `PdfReader`,
 you can access them as properties of `viewer_preferences`.

diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py
@@ -108,12 +108,12 @@ class DocumentInformation(DictionaryObject):
     :py:class:`PdfReader.metadata<pypdf.PdfReader.metadata>`.
 
     All text properties of the document metadata have
-    *two* properties, eg. author and author_raw. The non-raw property will
-    always return a ``TextStringObject``, making it ideal for a case where
-    the metadata is being displayed. The raw property can sometimes return
-    a ``ByteStringObject``, if pypdf was unable to decode the string's
-    text encoding; this requires additional safety in the caller and
-    therefore is not as commonly accessed.
+    *two* properties, e.g. author and author_raw. The non-raw property will
+    always return a ``TextStringObject``, making it ideal for a case where the
+    metadata is being displayed. The raw property can sometimes return a
+    ``ByteStringObject``, if pypdf was unable to decode the string's text
+    encoding; this requires additional safety in the caller and therefore is not
+    as commonly accessed.
     """
 
     def __init__(self) -> None:
@@ -391,7 +391,7 @@ def get_named_dest_root(self) -> ArrayObject:
                 dests = cast(DictionaryObject, names[CA.DESTS])
                 dests_ref = dests.indirect_reference
                 if CA.NAMES in dests:
-                    # TABLE 3.33 Entries in a name tree node dictionary
+                    # §7.9.6, entries in a name tree node dictionary
                     named_dest = cast(ArrayObject, dests[CA.NAMES])
                 else:
                     named_dest = ArrayObject()
@@ -449,8 +449,8 @@ def _get_named_destinations(
             # recurse down the tree
             for kid in cast(ArrayObject, tree[PA.KIDS]):
                 self._get_named_destinations(kid.get_object(), retval)
-        # TABLE 3.33 Entries in a name tree node dictionary (PDF 1.7 specs)
-        elif CA.NAMES in tree:  # KIDS and NAMES are exclusives (PDF 1.7 specs p 162)
+        # §7.9.6, entries in a name tree node dictionary
+        elif CA.NAMES in tree:  # /Kids and /Names are mutually exclusive (§7.9.6)
             names = cast(DictionaryObject, tree[CA.NAMES])
             i = 0
             while i < len(names):
@@ -813,7 +813,7 @@ def _get_outline(
                 if isinstance(lines, NullObject):
                     return outline
 
-                # TABLE 8.3 Entries in the outline dictionary
+                # §12.3.3 Document outline, entries in the outline dictionary
                 if lines is not None and "/First" in lines:
                     node = cast(DictionaryObject, lines["/First"])
             self._namedDests = self._get_named_destinations()
@@ -847,8 +847,14 @@ def threads(self) -> Optional[ArrayObject]:
 
         See §12.4.3 from the PDF 1.7 or 2.0 specification.
 
-        It is an array of dictionaries with "/F" and "/I" properties or
+        It is an array of dictionaries with "/F" (the first bead in the thread)
+        and "/I" (a thread information dictionary containing information about
+        the thread, such as its title, author, and creation date) properties or
         None if there are no articles.
+
+        Since PDF 2.0 it can also contain an indirect reference to a metadata
+        stream containing information about the thread, such as its title,
+        author, and creation date.
         """
         catalog = self.root_object
         if CO.THREADS in catalog:
@@ -923,7 +929,7 @@ def _build_outline_item(self, node: DictionaryObject) -> Optional[Destination]:
         dest, title, outline_item = None, None, None
 
         # title required for valid outline
-        # PDF Reference 1.7: TABLE 8.4 Entries in an outline item dictionary
+        # §12.3.3, entries in an outline item dictionary
         try:
             title = cast("str", node["/Title"])
         except KeyError:
@@ -998,16 +1004,16 @@ def _build_outline_item(self, node: DictionaryObject) -> Optional[Destination]:
     def pages(self) -> List[PageObject]:
         """
         Property that emulates a list of :class:`PageObject<pypdf._page.PageObject>`.
-        this property allows to get a page or a range of pages.
-
-        For PdfWriter Only:
-        Provides the capability to remove a page/range of page from the list
-        (using the del operator).
-        Note: only the page entry is removed. As the objects beneath can be used
-        elsewhere.
-        A solution to completely remove them - if they are not used anywhere -
-        is to write to a buffer/temporary file and to load it into a new PdfWriter
-        object afterwards.
+        This property allows getting a page or a range of pages.
+
+        Note:
+            For PdfWriter only: Provides the capability to remove a page/range of
+            pages from the list (using the del operator). Remember: Only the page
+            entry is removed, as the objects beneath can be used elsewhere. A
+            solution to completely remove them - if they are not used anywhere - is
+            to write to a buffer/temporary file and then load it into a new
+            PdfWriter.
+
         """
         return _VirtualList(self.get_num_pages, self.get_page)  # type: ignore
 

diff --git a/pypdf/_encryption.py b/pypdf/_encryption.py
@@ -814,7 +814,7 @@ def __init__(
         EFF: str,
         values: Optional[EncryptionValues],
     ) -> None:
-        # See TABLE 3.18 Entries common to all encryption dictionaries
+        # §7.6.2, entries common to all encryption dictionaries
         # use same name as keys of encryption dictionaries entries
         self.V = V
         self.R = R

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
@@ -1096,7 +1096,7 @@ def clone_reader_document_root(self, reader: PdfReader) -> None:
         should be considered.
 
         Args:
-            reader: PdfReader from the document root should be copied.
+            reader: PdfReader from which the document root should be copied.
         """
         self._objects.clear()
         self._root_object = reader.root_object.clone(self)
@@ -1256,7 +1256,7 @@ def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]:
                 existing workflow.
 
         Returns:
-            A tuple (bool, IO)
+            A tuple (bool, IO).
         """
         my_file = False
 
@@ -2238,7 +2238,7 @@ def add_annotation(
     def clean_page(self, page: Union[PageObject, IndirectObject]) -> PageObject:
         """
         Perform some clean up in the page.
-        Currently: convert NameObject nameddestination to TextStringObject
+        Currently: convert NameObject named destination to TextStringObject
         (required for names/dests list)
 
         Args:
@@ -2797,7 +2797,7 @@ def _insert_filtered_outline(
             self._insert_filtered_outline(dest._filtered_children, np, None)
 
     def close(self) -> None:
-        """To match the functions from Merger."""
+        """Implemented for API harmonization."""
         return
 
     def find_outline_item(

diff --git a/pypdf/filters.py b/pypdf/filters.py
@@ -134,8 +134,8 @@ def decode(
                 pass  # Usually an array with a null object was read
         # predictor 1 == no predictor
         if predictor != 1:
-            # The /Columns param. has 1 as the default value; see ISO 32000,
-            # §7.4.4.3 LZWDecode and FlateDecode Parameters, Table 8
+            # /Columns, the number of samples in each row, has a default value of 1;
+            # §7.4.4.3, ISO 32000.
             DEFAULT_BITS_PER_COMPONENT = 8
             try:
                 columns = cast(int, decode_parms[LZW.COLUMNS].get_object())  # type: ignore
@@ -528,7 +528,7 @@ def decode(
 
 
 class CCITParameters:
-    """TABLE 3.9 Optional parameters for the CCITTFaxDecode filter."""
+    """§7.4.6, optional parameters for the CCITTFaxDecode filter."""
 
     def __init__(self, K: int = 0, columns: int = 0, rows: int = 0) -> None:
         self.K = K
@@ -552,20 +552,20 @@ def group(self) -> int:
 
 class CCITTFaxDecode:
     """
-    See 3.3.5 CCITTFaxDecode Filter (PDF 1.7 Standard).
+    §7.4.6, CCITTFaxDecode filter (ISO 32000).
 
     Either Group 3 or Group 4 CCITT facsimile (fax) encoding.
     CCITT encoding is bit-oriented, not byte-oriented.
 
-    See: TABLE 3.9 Optional parameters for the CCITTFaxDecode filter
+    §7.4.6, optional parameters for the CCITTFaxDecode filter.
     """
 
     @staticmethod
     def _get_parameters(
         parameters: Union[None, ArrayObject, DictionaryObject, IndirectObject],
         rows: int,
     ) -> CCITParameters:
-        # TABLE 3.9 Optional parameters for the CCITTFaxDecode filter
+        # §7.4.6, optional parameters for the CCITTFaxDecode filter
         k = 0
         columns = 1728
         if parameters: