diff --git a/pypdf/__init__.py b/pypdf/__init__.py index 2acfad672..5e2e06925 100644 --- a/pypdf/__init__.py +++ b/pypdf/__init__.py @@ -1,12 +1,16 @@ -from pypdf.pdf import PdfFileReader, PdfFileWriter -from pypdf.merger import PdfFileMerger -from pypdf.pagerange import PageRange -from pypdf._version import __version__ +from .pdf import PdfFileReader, PdfFileWriter +from .generic import * +from .merger import PdfFileMerger +from .pagerange import PageRange +from ._version import __version__ __all__ = [ # Basic PyPDF elements "PdfFileReader", "PdfFileWriter", "PdfFileMerger", "PageRange", + # most used elements from generic + "BooleanObject","ArrayObject","IndirectObject","FloatObject","NumberObject","createStringObject", + "TextStringObject","NameObject","DictionaryObject","TreeObject","Destination","PageLabel","Bookmark", # PyPDF modules "pdf", "generic", "utils", "filters", "merger", "pagerange", "xmp" ] diff --git a/pypdf/_version.py b/pypdf/_version.py index e2ba8ba3b..c30e09392 100644 --- a/pypdf/_version.py +++ b/pypdf/_version.py @@ -1 +1 @@ -__version__ = '1.27.0' +__version__ = '1.27.0PPzz' diff --git a/pypdf/filters.py b/pypdf/filters.py index 8459c0132..5fb5149a0 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -1,4 +1,3 @@ -# -*- coding: UTF-8 -*- # vim: sw=4:expandtab:foldmethod=marker # # Copyright (c) 2006, Mathieu Fenniak @@ -31,13 +30,14 @@ Implementation of stream filters for PDF. """ +import math import base64 import struct from sys import version_info -from pypdf import generic -from pypdf.generic import * -from pypdf.utils import PdfReadError, pypdfOrd, paethPredictor, PdfStreamError +from . import generic +from .generic import * +from .utils import PdfReadError, pypdfOrd, paethPredictor,PdfStreamError try: import zlib @@ -193,7 +193,7 @@ def decode(data, decodeParms=None): prev_rowdata = rowdata - for d in rowdata: + for d in rowdata[1:]: ##ppZZ ???? err in latest version if version_info < (3, 0): output.write(chr(d)) else: diff --git a/pypdf/generic.py b/pypdf/generic.py index d39355134..bdec3f700 100644 --- a/pypdf/generic.py +++ b/pypdf/generic.py @@ -38,8 +38,9 @@ import warnings from io import BytesIO -from pypdf.utils import * -from pypdf.utils import pypdfBytes as b_, pypdfUnicode as u_ +#from . import utils +from .utils import * +from .utils import pypdfUnicode as u_, pypdfBytes as b_ __author__ = "Mathieu Fenniak" __author_email__ = "biziqe@mathieu.fenniak.net" @@ -98,6 +99,10 @@ def getObject(self): """Resolves indirect references.""" return self + def clone(self,pdfD): #PPzz + """ clone object into pdfD """ + raise Exception("clone PdfObject") + return self # TO-DO Add __repr_() implementations to the *Object classes class NullObject(PdfObject): @@ -114,10 +119,19 @@ def readFromStream(stream): return NullObject() + def clone(self,pdfD): #PPzz + """ clone object into pdfD """ + return NullObject() + + class BooleanObject(PdfObject): def __init__(self, value): self.value = value + def clone(self,pdfD): #PPzz + """ clone object into pdfD """ + return BooleanObject(self.value) + def writeToStream(self, stream, encryption_key): if self.value: stream.write(b_("true")) @@ -139,6 +153,17 @@ def readFromStream(stream): class ArrayObject(list, PdfObject): + + def clone(self,pdfD): #PPzz + """ clone object into pdfD """ + arr = ArrayObject() + for data in self: + if 'clone' in dir(data): + arr.append(data.clone(pdfD)) + else: + arr.append(data) + return arr + def writeToStream(self, stream, encryption_key): stream.write(b_("[")) @@ -196,6 +221,22 @@ def __init__(self, idnum, generation, pdf): self.generation = generation self.pdf = pdf + def clone(self,pdfD): #PPzz + """ clone object into pdfD """ + try: pdfD._IdTranslated + except: + pdfD._IdTranslated={} + try: + n=pdfD._IdTranslated[self.idnum] + except: + n=len(pdfD._objects)+1 + pdfD._IdTranslated[self.idnum]=n + pdfD._objects.append("%d NotInit"%n) + o=self.getObject().clone(pdfD) + pdfD._objects[n-1]=o + + return IndirectObject(n,0,pdfD) + def getObject(self): return self.pdf.getObject(self).getObject() @@ -265,6 +306,10 @@ def __new__(cls, value="0", context=None): except: return decimal.Decimal.__new__(cls, str(value)) + def clone(self,pdfD): #PPzz + """ clone object into pdfD """ + return FloatObject(self.asNumeric()) + def __repr__(self): if self == self.to_integral(): return str(self.quantize(decimal.Decimal(1))) @@ -294,6 +339,10 @@ def __new__(cls, value): except OverflowError: return int.__new__(cls, 0) + def clone(self,pdfD): #PPzz + """ clone object into pdfD """ + return NumberObject(self.asNumeric()) + def asNumeric(self): return int(b_(repr(self))) @@ -444,6 +493,10 @@ class ByteStringObject(bytes_type, PdfObject): # returns self. original_bytes = property(lambda self: self) + def clone(self,pdfD): #PPzz + """ clone object into pdfD """ + return ByteStringObject(self) + def writeToStream(self, stream, encryption_key): bytearr = self @@ -471,6 +524,10 @@ class TextStringObject(string_type, PdfObject): # back-calculate what the original encoded bytes were. original_bytes = property(lambda self: self.getOriginalBytes()) + def clone(self,pdfD): #PPzz + """ clone object into pdfD """ + return createStringObject(self) + def getOriginalBytes(self): # We're a text string object, but the library is trying to get our raw # bytes. This can happen if we auto-detected this string as text, but @@ -513,6 +570,10 @@ class NameObject(str, PdfObject): delimiterPattern = re.compile(b_(r"\s+|[\(\)<>\[\]{}/%]")) surfix = b_("/") + def clone(self,pdfD): #PPzz + """ clone object into pdfD """ + return NameObject(self) + def writeToStream(self, stream, encryption_key): stream.write(b_(self)) @@ -549,6 +610,15 @@ def readFromStream(stream, pdf): class DictionaryObject(dict, PdfObject): + + def clone(self,pdfD): #PPzz + """ clone object into pdfD """ + d=DictionaryObject() + for k,v in self.items(): + d.update({(k.clone(k) if 'clone' in dir(k) else k): + (v.clone(pdfD) if 'clone' in dir(v) else v) }) + return d + def rawGet(self, key): return dict.__getitem__(self, key) @@ -734,7 +804,15 @@ def readFromStream(stream, pdf): class TreeObject(DictionaryObject): def __init__(self): - DictionaryObject.__init__() + DictionaryObject.__init__(self) + + def clone(self,pdfD): #PPzz + """ clone object into pdfD """ + raise Exception("clone TreeObject",self) + obj=TreeObject() + for k,v in self.items(): + obj.addChild(v.clone(pdfD),pdfD) + return obj def hasChildren(self): return '/First' in self @@ -875,6 +953,14 @@ def __init__(self): self._data = None self.decodedSelf = None + def clone(self,pdfD): #PPzz + """ clone object into pdfD """ + st=self.__class__() + st._data=self._data + st.decodedSelf=self.decodedSelf + st.update(self) + return self + def writeToStream(self, stream, encryption_key): self[NameObject("/Length")] = NumberObject(len(self._data)) DictionaryObject.writeToStream(self, stream, encryption_key) @@ -906,7 +992,7 @@ def initializeFromDictionary(data): return retval def flateEncode(self): - from pypdf.filters import FlateCodec + from .filters import FlateCodec if "/Filter" in self: f = self["/Filter"] @@ -934,7 +1020,7 @@ def __init__(self): self.decodedSelf = None def getData(self): - from pypdf.filters import decodeStreamData + from .filters import decodeStreamData if self.decodedSelf: # Cached version of decoded object @@ -2098,6 +2184,7 @@ def __init__(self, title, page, typ, *args): self[NameObject("/Title")] = title self[NameObject("/Page")] = page self[NameObject("/Type")] = typ + self.parent=None #PPzz # from table 8.2 of the PDF 1.7 reference. if typ == "/XYZ": @@ -2195,6 +2282,106 @@ def writeToStream(self, stream, encryption_key): :rtype: ``int``, or ``None`` if not available. """ +class PageLabel(): + def __init__(self,pn=0,defObject=None): + """ + :param + integer pn: 1st Page of the group + defObject: tuple (1stPage,prefix,increment) or DictionnaryObject from the file + """ + + if defObject is None: + defObject = DictionaryObject() + + try: + if type(defObject) != tuple: + self.prefix=defObject['/P'] + else: + self.prefix=defObject[1]+""#None will induce and error and reach default value + except: + self.prefix='' + + try: + if type(defObject) != tuple: + self.numbering=defObject['/S'] + else: + self.numbering=defObject[2]+""#None will induce and error and reach default value + except: + self.numbering='/D' if self.prefix == "" else "" + + self.pn=pn #1st page of the range + try: + if type(defObject) != tuple: + self.first=int(defObject['/St'])-pn + else: + self.first=max(1,int(defObject[0]))-pn #None will induce and error and reach default value + except: + self.first=1-pn + + def __repr__(self): + return "PageLabel Obj(@%r :%s-%s)" % (self.first, self.prefix, self.numbering) + + def buildDefinition(self,pn=None): + """ + build the DictionnaryObjecgt to inject into the PDF + """ + o=DictionaryObject() + if self.numbering!='/D' or self.prefix!='': + o.update({ NameObject("/S"):NameObject(self.numbering) }) + if self.prefix!='': + o.update({ NameObject("/P"):NameObject(self.prefix) }) + if pn==None: + o.update({ NameObject("/St"):NumberObject(self.first+self.pn) }) + elif pn==0: + pass; #No start value + else: + o.update({ NameObject("/St"):NumberObject(pn) }) + return o + + def getLabel(self,pn): + def int_to_Roman(num): + val = [ + 1000, 900, 500, 400, + 100, 90, 50, 40, + 10, 9, 5, 4, + 1 + ] + syb = [ + "M", "CM", "D", "CD", + "C", "XC", "L", "XL", + "X", "IX", "V", "IV", + "I" + ] + roman_num = '' + i = 0 + while num > 0: + for _ in range(num // val[i]): + roman_num += syb[i] + num -= val[i] + i += 1 + return roman_num + + def int_to_Alpha(num): + t="" + while(num>0): + num=num-1 + t=chr(num%26+65)+t + num=num//26 + return t + if self.numbering=='/D': + st=str(pn+self.first) + elif self.numbering=='/R': + st=int_to_Roman(pn+self.first) + elif self.numbering=='/r': + st=int_to_Roman(pn+self.first).lower() + elif self.numbering=='/A': + st=int_to_Alpha(pn+self.first) + elif self.numbering=='/a': + st=int_to_Alpha(pn+self.first).lower() + else: + st='' + return self.prefix+st + class Bookmark(Destination): def writeToStream(self, stream, encryption_key): diff --git a/pypdf/merger.py b/pypdf/merger.py index 8a029ea22..39a325d73 100644 --- a/pypdf/merger.py +++ b/pypdf/merger.py @@ -207,9 +207,12 @@ def append(self, fileobj, bookmark=None, pages=None, importBookmarks=True): """ self.merge(len(self._pages), fileobj, bookmark, pages, importBookmarks) - def write(self): + def write(self, fileobj=None): """ Writes all data that has been merged to the given output file. + + :param fileobj: Output file. Can be a filename or any kind of + file-like object. """ for page in self._pages: self._writer.addPage(page.pagedata) @@ -222,7 +225,7 @@ def write(self): self._writeBookmarks() # Write the output to the file - self._writer.write() + self._writer.write(fileobj) def close(self): """ diff --git a/pypdf/pagerange.py b/pypdf/pagerange.py index f8296d8de..cdf5e7acb 100644 --- a/pypdf/pagerange.py +++ b/pypdf/pagerange.py @@ -8,8 +8,7 @@ """ import re - -from pypdf.utils import isString +from .utils import isString _INT_RE = r"(0|-?[1-9]\d*)" # A decimal int, don't allow "-0". PAGE_RANGE_RE = "^({int}|({int}?(:{int}?(:{int}?)?)))$".format(int=_INT_RE) diff --git a/pypdf/pdf.py b/pypdf/pdf.py index 82ecf925e..3cbf7c831 100644 --- a/pypdf/pdf.py +++ b/pypdf/pdf.py @@ -39,14 +39,17 @@ import random import struct import time +import datetime +import sys import uuid from hashlib import md5 +from types import MethodType from sys import version_info -from pypdf import utils -from pypdf.generic import * -from pypdf.utils import * -from pypdf.utils import pypdfBytes as b_ +from . import utils +from .generic import * +from .utils import * +from .utils import pypdfBytes as b_ if version_info < (3, 0): from cStringIO import StringIO @@ -54,6 +57,8 @@ else: from io import StringIO, BytesIO +import warnings +import codecs __author__ = "Mathieu Fenniak" __author_email__ = "biziqe@mathieu.fenniak.net" @@ -62,7 +67,7 @@ class PdfFileWriter(object): - def __init__(self, stream, debug=False): + def __init__(self, stream=None, pdfReaderAsSource=None,debug=False): """ This class supports writing PDF files out, given pages produced by another class (typically :class:`PdfFileReader`). @@ -70,6 +75,8 @@ def __init__(self, stream, debug=False): :param stream: File-like object or path to a PDF file in ``str`` format. If of the former type, the object must support the ``write()`` and the ``tell()`` methods. + :para pdfReaderAsSource: pdfFileReader object : + if passed, it is cloned into the writer :param bool debug: Whether this class should emit debug informations (recommended for development). Defaults to False. """ @@ -89,6 +96,15 @@ def __init__(self, stream, debug=False): "written to correctly." % self._stream.name ) + # to be done before cloning alternative + self._flattenPageLabels=None + # copy compatible methods from Reader + self.getPageLabel=MethodType(PdfFileReader.getPageLabel,self) + + if isinstance(pdfReaderAsSource,PdfFileReader): + self.clone(pdfReaderAsSource) + return + # The root of our page tree node. pages = DictionaryObject() pages.update({ @@ -157,6 +173,14 @@ def isClosed(self): """ return not bool(self._stream) or self._stream.closed + def clone(self,pdfR): #ppZZ + self._IdTranslated={} + tr=pdfR._trailer.clone(self) + self._pages=tr['/Root'].rawGet('/Pages') + self._info=tr.rawGet('/Info') + self._rootObject=tr['/Root'] + self._root=tr.rawGet('/Root') + def _addObject(self, obj): self._objects.append(obj) @@ -168,16 +192,29 @@ def getObject(self, ido): return self._objects[ido.idnum - 1] - def _addPage(self, page, action): - if page["/Type"] != "/Page": - raise ValueError("Page type is not /Page") - - page[NameObject("/Parent")] = self._pages - pages = self.getObject(self._pages) - action(pages["/Kids"], self._addObject(page)) - - pages[NameObject("/Count")] = NumberObject(pages["/Count"] + 1) - + def _insertPage(self, page, pageNumber): + assert page["/Type"] == "/Page" + pn=self._pages.getObject()['/Count'] + if pageNumber>=pn: + nextPage,firstPageNum=self._getPage(pn-1,self._pages,0) + pageNumber=pn + else: + nextPage,firstPageNum=self._getPage(pageNumber,self._pages,0) + + nextPage=nextPage.getObject() + pages=nextPage['/Parent'] + pp=nextPage.rawGet('/Parent') + page[NameObject("/Parent")] = pp + pages["/Kids"].insert(pageNumber-firstPageNum,self._addObject(page)) + + while pp is not None: + pp1=pp.getObject() + pp1[NameObject("/Count")] = NumberObject(pp1["/Count"] + 1) + if pp==self._pages: + pp=None + else: + pp=pp1.rawGet("/Parent") + def addPage(self, page): """ Adds a page to this PDF file. The page is usually acquired from a @@ -186,7 +223,7 @@ def addPage(self, page): :param PageObject page: The page to add to the document. Should be an instance of :class:`PageObject` """ - self._addPage(page, list.append) + self.insertPage(page,self._pages.getObject()["/Count"]) def insertPage(self, page, index=0): """ @@ -197,20 +234,88 @@ def insertPage(self, page, index=0): should be an instance of :class:`PageObject`. :param int index: Position at which the page will be inserted. """ - self._addPage(page, lambda l, p: l.insert(index, p)) + assert 0<=index<=self._pages.getObject()["/Count"] + self._insertPage(page,index) - def getPage(self, pageNumber): + def _getPage(self,pageNum,node,firstPageNum): #ppZZ + """ + internal + :param int pageNum: page searched for + :param IndirectObject node: point to a page or pages within the page tree + :param int firstPageNum: page number of the first page of the tree below + + """ + if node.getObject()['/Type'] == '/Page': # it is only one page we have to check + if pageNum == firstPageNum : + return node,-1 #One Page is not a group, we have to return -1 in order to return the 1st page number of the group + else: + return firstPageNum+1,-1 # return next first page number + elif firstPageNum <= pageNum ` """ - pages = self.getObject(self._pages) - # XXX: crude hack - return pages["/Kids"][pageNumber].getObject() + assert 0<=pageNumber to write to is not in binary mode. It may not be written to correctly." % stream.name) + + if stream is not None: + savedStream=self._stream + self._stream=stream + try: + self.write() + finally: + self._stream=savedStream + return + if not self._root: self._root = self._addObject(self._rootObject) @@ -692,6 +822,10 @@ def getReference(self, obj): return ref + def getIndirectObject(self, idnum): #ppZZ + ref = IndirectObject(idnum, 0, self) + return ref + def getOutlineRoot(self): if '/Outlines' in self._rootObject: outline = self._rootObject['/Outlines'] @@ -707,47 +841,64 @@ def getOutlineRoot(self): return outline - def getNamedDestRoot(self): - if '/Names' in self._rootObject and \ - isinstance(self._rootObject['/Names'], DictionaryObject): - names = self._rootObject['/Names'] - idnum = self._objects.index(names) + 1 - namesRef = IndirectObject(idnum, 0, self) + + #Copied from Reader + def _buildDestination(self, title, array): + return Destination(title, array[0], array[1], *array[2:]) + + def getNamedDestinations(self, tree=None, retval=None): + """ + Retrieves the named destinations present in the document. + + :return: a dictionary which maps names to + :class:`Destinations`. + :rtype: dict + """ + if retval is None: + retval = {} + catalog = self._rootObject + + # get the name tree + if "/Dests" in catalog: + tree = catalog["/Dests"] + elif "/Names" in catalog: + names = catalog['/Names'] + if "/Dests" in names: + tree = names['/Dests'] - assert namesRef.getObject() == names + if tree is None: + return retval - if '/Dests' in names and \ - isinstance(names['/Dests'], DictionaryObject): - dests = names['/Dests'] - idnum = self._objects.index(dests) + 1 - destsRef = IndirectObject(idnum, 0, self) + if "/Kids" in tree: + # recurse down the tree + for kid in tree["/Kids"]: + self.getNamedDestinations(kid.getObject(), retval) - assert destsRef.getObject() == dests + elif "/Names" in tree: #ppZZ if => elif + names = tree["/Names"] + for i in range(0, len(names), 2): + key = names[i].getObject() + val = names[i+1].getObject() - if '/Names' in dests: - nd = dests['/Names'] - else: - nd = ArrayObject() - dests[NameObject('/Names')] = nd - else: - dests = DictionaryObject() - destsRef = self._addObject(dests) - names[NameObject('/Dests')] = destsRef - nd = ArrayObject() - dests[NameObject('/Names')] = nd + if isinstance(val, DictionaryObject) and '/D' in val: + val = val['/D'] - else: - names = DictionaryObject() - namesRef = self._addObject(names) - self._rootObject[NameObject('/Names')] = namesRef - dests = DictionaryObject() - destsRef = self._addObject(dests) - names[NameObject('/Dests')] = destsRef - nd = ArrayObject() - dests[NameObject('/Names')] = nd + dest = self._buildDestination(key, val) + if dest is not None: + retval[key] = dest + else: # case where Dests is in root catalog + for k,v in tree.items(): + val=v.getObject() + if isinstance(val, DictionaryObject) and '/D' in val: + val = val['/D'] + dest = self._buildDestination(k,val) + if dest != None: + retval[k] = dest + + return retval - return nd + #bookmarks are added in def addBookmarkDestination(self, dest, parent=None): destRef = self._addObject(dest) @@ -788,8 +939,7 @@ def addBookmarkDict(self, bookmark, parent=None): def addBookmark( self, title, pagenum, parent=None, color=None, bold=False, - italic=False, fit='/Fit', *args - ): + italic=False, fit='/Fit', *args): """ Add a bookmark to this PDF file. @@ -804,7 +954,7 @@ def addBookmark( :param str fit: The fit of the destination page. See :meth:`addLink()` for details. """ - pageRef = self.getObject(self._pages)['/Kids'][pagenum] + pageRef = self.getPage(pagenum,True) action = DictionaryObject() zoomArgs = [] @@ -853,40 +1003,303 @@ def addBookmark( return bookmarkRef - def addNamedDestinationObject(self, dest): - destRef = self._addObject(dest) + def addNamedDestinationObject(self, dest,title=None): + def _getMinMaxKey(node,_min=True): + if "/Names" in node: + return node["/Names"][0 if _min else -2] + elif "/Kids" in node: + return _getMinMaxKey(node["/Kids"][0 if _min else -1].getObject(),_min) + else: + raise Exception("_getMinMaxKey abnormal") + + def _insertNamedDest(title,dest,node,force=0): + if "/Limits" in node: + mi,ma=node['/Limits'][0:2] + elif ("/Kids" in node and len(node["/Kids"]) == 0) : + raise Exception("Kids list empty ???") + elif ("/Names" in node and len(node["/Names"]) == 0): + title=TextStringObject(title) + node['/Names'].append(title) + node['/Names'].append(dest) + node.update({NameObject('/Limits'):ArrayObject([title,title])}) + return node['/Limits'] + else: #there is some data but no Limits(it should not exists + mi=_getMinMaxKey(node,True) + ma=_getMinMaxKey(node,False) + + if "/Names" in node: #it is a list of names + if titlenode['/Limits'][1]: + node['/Limits'][1]=title + return node['/Limits'] + else: + return None + elif "/Kids" in node: #need to process one level down + if force == 1: + lim=_insertNamedDest(title,dest,node['/Kids'][-1].getObject(),+1) + if '/Limits' not in node: node.update({ NameObject('/Limits') : ArrayObject([ mi, lim[1] ]) }) + node['/Limits'][1]=lim[1] + return node['/Limits'] + elif title=0 + + def removeAnnots(self,pageSet=None,links=False,comments=False,attachments=False,prints=False,_3D=False): + """ + Removes different annotations from this output. + """ + if pageSet is None: + pageSet=range(self.numPages) + + #if all are false, for compatibility, they should be all deleted + if not(links or comments or attachments or prints or _3D): + links=True + comments=True + attachments=True + prints=True + _3D=True + subTypes=[] + if links: + subTypes.extend(['/Link',]) + if comments: + subTypes.extend(['/Text','/FreeText','/Line','/Square','/Circle','/Polygon','/PolyLine',\ + '/Highlight','/Underline','/Squiggly','/StrikeOut','/Stamp','/Caret',\ + '/Ink','/Popup',]) + if attachments: + subTypes.extend(['/FileAttachment','/Sound','/Movie','/Widget','/Screen',]) + if prints: + subTypes.extend(['/PrinterMark','/TrapNet','/Watermark',]) + if _3D: + subTypes.extend(['/3D']) + + for i in pageSet: + page = self.getPage(i) + if "/Annots" in page: + ik=0 + while iknode['/Limits'][1]: + node['/Limits'][1]=pn + return node['/Limits'] + else: + return None + elif "/Kids" in node: #need to process one level down + if force == 1: + lim=_insertPageLabel(pn,pagelbl,node['/Kids'][-1].getObject(),+1) + if '/Limits' not in node: node.update({ NameObject('/Limits') : ArrayObject([ mi, lim[1] ]) }) + node['/Limits'][1]=lim[1] + return node['/Limits'] + elif pn elif names = tree["/Names"] for i in range(0, len(names), 2): key = names[i].getObject() @@ -1615,6 +2248,12 @@ def getNamedDestinations(self, tree=None, retval=None): dest = self._buildDestination(key, val) if dest is not None: retval[key] = dest + else: # case where Dests is in root catalog + for k,v in tree.items(): + val=v.getObject() + dest = self._buildDestination(k,val) + if dest != None: + retval[k] = dest return retval @@ -1627,7 +2266,7 @@ def getOutlines(self, node=None, outlines=None): """ if outlines is None: outlines = [] - catalog = self._trailer["/Root"] + catalog = self._rootObject # get the outline dictionary and named destinations if "/Outlines" in catalog: @@ -1739,6 +2378,19 @@ def _buildOutline(self, node): outline[NameObject("/Title")] = title else: raise PdfReadError("Unexpected destination %r" % dest) + + #ppZZ : add parent + outline.parent=None + if "/Parent" in node: + p=node["/Parent"].getObject() + try: + if "/Type" in p and p["/Type"] == '/Outlines': + outline.parent=None + elif "/Title" in p and p["/Title"] != '': + outline.parent=node["/Parent"] + except: + pass + return outline pages = property( @@ -1752,6 +2404,51 @@ def _buildOutline(self, node): :meth:`getPage()` methods. """ + def getPageLabel(self,num): + def findPageLblEntry(num): + #there will be always 0 that will match... + k1=-.5 + for k in sorted(self._flattenPageLabels.keys()): + if k>num: break + k1=k + + if num!=k: + k=k1 + return self._flattenPageLabels[k].getLabel(num) + + def flattenPageLabel(node=None): + flat={} + """ + the default value we use this value in order to have a + default value that will be overriden by 0 if provided and + if we want to check that there is a definition for page 0 + """ + flat[-0.5]=PageLabel(0,(0,'','/D')) + if node is None: + p1=self._rootObject + if "/PageLabels" in p1: + node=p1["/PageLabels"] + else: + return flat + if '/Nums' in node: + node=node['/Nums'].getObject() + for i in range(len(node)//2): + o=PageLabel(node[2*i],node[2*i+1].getObject()) + flat[node[2*i]]=o + elif '/Kids' in node: + for k in node['/Kids']: + flat.update(flattenPageLabel(k.getObject())) + else: + raise Exception("issue processing PageLabels") + return flat + + if self._flattenPageLabels is None: + self._flattenPageLabels =flattenPageLabel() + assert 0 <= num < self.numPages,"Page Number out of range" + return findPageLblEntry(num) + + + @property def pageLayout(self): """ @@ -1791,7 +2488,7 @@ def _flatten(self, pages=None, inherit=None, indirectRef=None): inherit = dict() if pages is None: self._flattenedPages = [] - catalog = self._trailer["/Root"].getObject() + catalog = self._rootObject pages = catalog["/Pages"].getObject() t = "/Pages" @@ -2254,8 +2951,10 @@ def usedBefore(num, generation): if "/XRefStm" in newTrailer: startxref = newTrailer["/XRefStm"] + del self._trailer["/XRefStm"] #to ensure there will be no loops elif "/Prev" in newTrailer: startxref = newTrailer["/Prev"] + del self._trailer["/Prev"] #to ensure there will be no loops else: break elif x.isdigit(): # PDF 1.5+ Cross-Reference Stream @@ -2342,14 +3041,16 @@ def usedBefore(num, generation): 1, xrefstreamOffset, xrefstmGen ) - trailerKeys = ("/Root", "/Encrypt", "/Info", "/ID") + trailerKeys = ("/Root", "/Encrypt", "/Info", "/ID","/Prev") for key in trailerKeys: if key in xrefstream and key not in self._trailer: self._trailer[NameObject(key)] = xrefstream.rawGet(key) - if "/Prev" in xrefstream: - startxref = xrefstream["/Prev"] + #based on other software, the Previous Prev shall also be processed... + if "/Prev" in self._trailer: ##ppZZ : /Prev was collected/updated before + startxref = self._trailer["/Prev"] + del self._trailer["/Prev"] #to ensure there will be no loops else: break else: @@ -2605,6 +3306,12 @@ def _authenticateUserPassword(self, password): def isEncrypted(self): return "/Encrypt" in self._trailer + def getIndirectObject(self, idnum): #ppZZ + ref = IndirectObject(idnum, 0, self) + return ref + + + def _convertToInt(d, size): if size > 8: diff --git a/samplecode/MergingComments.py b/samplecode/MergingComments.py new file mode 100644 index 000000000..1c4072fa5 --- /dev/null +++ b/samplecode/MergingComments.py @@ -0,0 +1,61 @@ +#!/usr/bin/python3 +""" + test/demo program that copy alll comments from multiples pdf into one command line: + PDFCommentsMerge [-d] [-o output.pdf] [input1.pdf] ... [inputN.pdf] + -d: open Excel output at the end of extraction + -o: prode the output Excel name/path ; if not present the file is created + in temp folder named "FullCommented **input1**.pdf" + if no parameters (mainly for idle test), the pdf filenames re asked for + empty to finish +""" +import sys +import os +import pypdf as PDF; + + +if sys.argv[0].upper().find("PYTHON.EXE")>=0: + del sys.argv[0] +del sys.argv[0] # to ignore called program + +displayOutput=('-d' in sys.argv) or ('idlelib.run' in sys.modules) +try: + del sys.argv[sys.argv.index('-d')] +except: + pass + + +if (len(sys.argv)==0) or (('-o' in sys.argv) and (len(sys.argv)<=2)) : + print(globals()['__doc__']) + while True: + t=input("pdf file to scan:") + if t=='':break + sys.argv.append(t) + +if '-o' in sys.argv: + i=sys.argv.index('-o') + outFile=sys.argv[i+1] + del sys.argv[i] + del sys.argv[i] +else: + tempFolder=os.environ['TEMP'].replace('\\','/') + if tempFolder[-1]!='/' : tempFolder+='/' + outFile=tempFolder+"FullCommented "+os.path.splitext(os.path.split(sys.argv[0])[-1])[0]+'.pdf' + +pdfO=PDF.PdfFileWriter(None,PDF.PdfFileReader(sys.argv[0])) +del sys.argv[0] + +pdfS=[] +for f in sys.argv: + pdfS.append(PDF.PdfFileReader(f)) + #check if decryption is required ; normally not required + if pdfS[-1].isEncrypted: pdfS[-1].decrypt('') + +#we assume that all the documents are commenting the same original document +for i in range(pdfO.numPages): + po=pdfO.getPage(i) + for pdfin in pdfS: + pdfO.addCommentsFromPage(i,pdfin.getPage(i)) + +pdfO.write(outFile) +if displayOutput: + os.startfile(outFile) diff --git a/samplecode/PDFComments2XL.py b/samplecode/PDFComments2XL.py new file mode 100644 index 000000000..805de539e --- /dev/null +++ b/samplecode/PDFComments2XL.py @@ -0,0 +1,165 @@ +#!/usr/bin/python3 +""" + test/demo program tha extract comments from an pdf into a Excel + command line: + PDFComments2XL [-d] [-o output.xls] [input.pdf] + -d: open Excel output at the end of extraction + -o: prode the output Excel name/path ; if not present the file is created + in temp folder named "comments on **PDFfile**.xlsx" + if no parameters (mainly for idle test), the pdf filename is asked for +""" +from collections import OrderedDict +from datetime import datetime +import sys +import os +import pypdf as PDF; +from openpyxl import Workbook +from openpyxl.utils import get_column_letter + +import locale +locale.setlocale(locale.LC_ALL,locale.getdefaultlocale()[0]) + +def ListOutlines(pdfS,outl=None): + """ + provide as a list of the outlines as tuple Title,Page(0 based),Vertical position in % + """ + if outl is None: + lst=[('-',0,0),] + outl=pdfS.getOutlines() + else: + lst=[] + if isinstance(outl,list): + for k in outl: + lst+=ListOutlines(pdfS,k) + else: + try: + top=outl['/Top'] + except: + top=0 + try: + pp=pdfS.MyPages[outl.page.idnum] + lst.append((outl.title,pp[0],100.0*(1.0-float(top/pp[1])))) + except: + print("trouble with page idnum",outl.page.idnum) + return lst + +def ListAnnots(pdfS): + """ + provide as a list of the comments with the response saved in .irt_str field, the list is indexed with idnums + """ + lst=OrderedDict() + for pn in range(pdfS.numPages): + p=pdfS.getPage(pn) + try: + a=p.get('/Annots' ).getObject() + if not isinstance(a,list): a=[a] + for b in a: + o=b.getObject() + if o['/Subtype']=='/Text': + try: o['/P'] # le champs '/P' etant optionnel on le reconstruit... + except: + o.update({PDF.NameObject('/P'):p.indirectRef}) + o.irt={} + lst[b.idnum]=o + except: + pass + #copy the information into the original comment + for k,o in lst.items(): + if '/IRT' in o: + t=o['/Contents'] + if isinstance(t,bytes):t=t.replace(b'\r',b'\n').decode('unicode_escape') + lst[o.rawGet('/IRT').idnum].irt[o['/M']]=\ + '%s (%s):\n%s'%\ + (o['/T'],datetime.strptime(o['/M'][2:10],'%Y%m%d').strftime('%x'),t) + #concat all replied comments into one string to ease insertion later... + for o in lst.values(): + o.irt_str='\n'.join([o.irt[x] for x in sorted(o.irt.keys())]) + return lst + +def FindOutline(Outlines,pa,pe): + """ + provide the outline just above the position (of the comment) + """ + m=None + for o in Outlines: + if(o[1]=0: + del sys.argv[0] + +if len(sys.argv)==1: + print(globals()['__doc__']) + sys.argv.append(input("pdf file to scan:")) + +pdfS=PDF.PdfFileReader(sys.argv[-1]) + +if '-o' in sys.argv: + xlFile=sys.argv [sys.argv.index('-o')+1] +else: + tempFolder=os.environ['TEMP'].replace('\\','/') + if tempFolder[-1]!='/' : tempFolder+='/' + xlFile=tempFolder+"Comments on "+os.path.splitext(os.path.split(pdfS.filepath)[-1])[0]+'.xlsx' + +#prepare the destination workbook +wb = Workbook() +ws=wb.active +ws.append(('Page','Pos','Chapt','Originator','Comment','Answer')) +ws.column_dimensions[get_column_letter(0+1)].width=5 +ws.column_dimensions[get_column_letter(1+1)].width=5 +ws.column_dimensions[get_column_letter(2+1)].width=25 +ws.column_dimensions[get_column_letter(3+1)].width=15 +ws.column_dimensions[get_column_letter(4+1)].width=90 +ws.column_dimensions[get_column_letter(5+1)].width=90 + +#check if decryption is required +if pdfS.isEncrypted: pdfS.decrypt('') + +#MyPages will store the matching table page.idnum => pagenumer,page_height +pdfS.MyPages={} + +for i,p in enumerate(pdfS.pages): + pdfS.MyPages[p.indirectRef.idnum]=[i,p['/MediaBox'][3]] + +#extract the list of OutLines into MyOutlines +pdfS.MyOutlines=ListOutlines(pdfS) + +#extract the comments into MyAnnots +pdfS.MyAnnots=ListAnnots(pdfS) + + +#sort the comments in the order (Page, vertical position, date) +lst={} +for p in pdfS.MyAnnots.values(): + pp=pdfS.MyPages[p.rawGet("/P").idnum] + pc=100.0*(1.0-float(int(p['/Rect'][1])/pp[1])) + lst[(pp[0],pc,p['/M'])]=p + +#fill the xl sheet with the comments +for x in sorted(lst.keys()): + p=lst[x] + if '/IRT' in p: continue #the comments with IRT are already present in the original comment irt field, we can ignore this one + + #print(x[0],',',end='') + #print('%.0f %%'%pc,',',end='') + #print(FindOutline(pdfS.MyOutlines,x[0],x[1])[0],',',end='') + auth=p['/T'] + if isinstance(auth,bytes):auth=auth.decode('unicode_escape') + cont=p['/Contents'] + if isinstance(cont,bytes):cont=cont.replace(b'\r',b'\n').decode('unicode_escape') + #print(cont,',',end='') + if isinstance(p.irt_str,bytes):p.irt_str=p.irt_str.replace(b'\r',b'\n').decode('unicode_escape') + #print(p.irt_str) + + ws.append((pdfS.getPageLabel(x[0]) ,'%.0f %%'%pc,FindOutline(pdfS.MyOutlines,x[0],x[1])[0],auth,cont,p.irt_str)) + +#post insertion formating +for row in ws.iter_rows(): + for cell in row: + cell.alignment = cell.alignment.copy(wrapText=True,vertical='top') + +#save and open the file +wb.save(xlFile) +if ('-d' in sys.argv) or ('idlelib.run' in sys.modules): + os.startfile(xlFile)