diff --git a/pypdf/__init__.py b/pypdf/__init__.py index 5e2e06925..2acfad672 100644 --- a/pypdf/__init__.py +++ b/pypdf/__init__.py @@ -1,16 +1,12 @@ -from .pdf import PdfFileReader, PdfFileWriter -from .generic import * -from .merger import PdfFileMerger -from .pagerange import PageRange -from ._version import __version__ +from pypdf.pdf import PdfFileReader, PdfFileWriter +from pypdf.merger import PdfFileMerger +from pypdf.pagerange import PageRange +from pypdf._version import __version__ __all__ = [ # Basic PyPDF elements "PdfFileReader", "PdfFileWriter", "PdfFileMerger", "PageRange", - # most used elements from generic - "BooleanObject","ArrayObject","IndirectObject","FloatObject","NumberObject","createStringObject", - "TextStringObject","NameObject","DictionaryObject","TreeObject","Destination","PageLabel","Bookmark", # PyPDF modules "pdf", "generic", "utils", "filters", "merger", "pagerange", "xmp" ] diff --git a/pypdf/_version.py b/pypdf/_version.py index c30e09392..e2ba8ba3b 100644 --- a/pypdf/_version.py +++ b/pypdf/_version.py @@ -1 +1 @@ -__version__ = '1.27.0PPzz' +__version__ = '1.27.0' diff --git a/pypdf/filters.py b/pypdf/filters.py index 5fb5149a0..8459c0132 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -1,3 +1,4 @@ +# -*- coding: UTF-8 -*- # vim: sw=4:expandtab:foldmethod=marker # # Copyright (c) 2006, Mathieu Fenniak @@ -30,14 +31,13 @@ Implementation of stream filters for PDF. """ -import math import base64 import struct from sys import version_info -from . import generic -from .generic import * -from .utils import PdfReadError, pypdfOrd, paethPredictor,PdfStreamError +from pypdf import generic +from pypdf.generic import * +from pypdf.utils import PdfReadError, pypdfOrd, paethPredictor, PdfStreamError try: import zlib @@ -193,7 +193,7 @@ def decode(data, decodeParms=None): prev_rowdata = rowdata - for d in rowdata[1:]: ##ppZZ ???? err in latest version + for d in rowdata: if version_info < (3, 0): output.write(chr(d)) else: diff --git a/pypdf/generic.py b/pypdf/generic.py index 38d525ee7..5e5555da3 100644 --- a/pypdf/generic.py +++ b/pypdf/generic.py @@ -38,9 +38,8 @@ import warnings from io import BytesIO -#from . import utils -from .utils import * -from .utils import pypdfUnicode as u_, pypdfBytes as b_ +from pypdf.utils import * +from pypdf.utils import pypdfBytes as b_, pypdfUnicode as u_ __author__ = "Mathieu Fenniak" __author_email__ = "biziqe@mathieu.fenniak.net" @@ -99,10 +98,6 @@ def getObject(self): """Resolves indirect references.""" return self - def clone(self,pdfD): #PPzz - """ clone object into pdfD """ - raise Exception("clone PdfObject") - return self # TO-DO Add __repr_() implementations to the *Object classes class NullObject(PdfObject): @@ -119,19 +114,10 @@ def readFromStream(stream): return NullObject() - def clone(self,pdfD): #PPzz - """ clone object into pdfD """ - return NullObject() - - class BooleanObject(PdfObject): def __init__(self, value): self.value = value - def clone(self,pdfD): #PPzz - """ clone object into pdfD """ - return BooleanObject(self.value) - def writeToStream(self, stream, encryption_key): if self.value: stream.write(b_("true")) @@ -153,17 +139,6 @@ def readFromStream(stream): class ArrayObject(list, PdfObject): - - def clone(self,pdfD): #PPzz - """ clone object into pdfD """ - arr = ArrayObject() - for data in self: - if 'clone' in dir(data): - arr.append(data.clone(pdfD)) - else: - arr.append(data) - return arr - def writeToStream(self, stream, encryption_key): stream.write(b_("[")) @@ -221,22 +196,6 @@ def __init__(self, idnum, generation, pdf): self.generation = generation self.pdf = pdf - def clone(self,pdfD): #PPzz - """ clone object into pdfD """ - try: pdfD._IdTranslated - except: - pdfD._IdTranslated={} - try: - n=pdfD._IdTranslated[self.idnum] - except: - n=len(pdfD._objects)+1 - pdfD._IdTranslated[self.idnum]=n - pdfD._objects.append("%d NotInit"%n) - o=self.getObject().clone(pdfD) - pdfD._objects[n-1]=o - - return IndirectObject(n,0,pdfD) - def getObject(self): return self.pdf.getObject(self).getObject() @@ -306,10 +265,6 @@ def __new__(cls, value="0", context=None): except: return decimal.Decimal.__new__(cls, str(value)) - def clone(self,pdfD): #PPzz - """ clone object into pdfD """ - return FloatObject(self.asNumeric()) - def __repr__(self): if self == self.to_integral(): return str(self.quantize(decimal.Decimal(1))) @@ -339,10 +294,6 @@ def __new__(cls, value): except OverflowError: return int.__new__(cls, 0) - def clone(self,pdfD): #PPzz - """ clone object into pdfD """ - return NumberObject(self.asNumeric()) - def asNumeric(self): return int(b_(repr(self))) @@ -493,10 +444,6 @@ class ByteStringObject(bytes_type, PdfObject): # returns self. original_bytes = property(lambda self: self) - def clone(self,pdfD): #PPzz - """ clone object into pdfD """ - return ByteStringObject(self) - def writeToStream(self, stream, encryption_key): bytearr = self @@ -524,10 +471,6 @@ class TextStringObject(string_type, PdfObject): # back-calculate what the original encoded bytes were. original_bytes = property(lambda self: self.getOriginalBytes()) - def clone(self,pdfD): #PPzz - """ clone object into pdfD """ - return createStringObject(self) - def getOriginalBytes(self): # We're a text string object, but the library is trying to get our raw # bytes. This can happen if we auto-detected this string as text, but @@ -570,10 +513,6 @@ class NameObject(str, PdfObject): delimiterPattern = re.compile(b_(r"\s+|[\(\)<>\[\]{}/%]")) surfix = b_("/") - def clone(self,pdfD): #PPzz - """ clone object into pdfD """ - return NameObject(self) - def writeToStream(self, stream, encryption_key): stream.write(b_(self)) @@ -610,15 +549,6 @@ def readFromStream(stream, pdf): class DictionaryObject(dict, PdfObject): - - def clone(self,pdfD): #PPzz - """ clone object into pdfD """ - d=DictionaryObject() - for k,v in self.items(): - d.update({(k.clone(k) if 'clone' in dir(k) else k): - (v.clone(pdfD) if 'clone' in dir(v) else v) }) - return d - def rawGet(self, key): return dict.__getitem__(self, key) @@ -804,15 +734,7 @@ def readFromStream(stream, pdf): class TreeObject(DictionaryObject): def __init__(self): - DictionaryObject.__init__(self) - - def clone(self,pdfD): #PPzz - """ clone object into pdfD """ - raise Exception("clone TreeObject",self) - obj=TreeObject() - for k,v in self.items(): - obj.addChild(v.clone(pdfD),pdfD) - return obj + DictionaryObject.__init__() def hasChildren(self): return '/First' in self @@ -953,14 +875,6 @@ def __init__(self): self._data = None self.decodedSelf = None - def clone(self,pdfD): #PPzz - """ clone object into pdfD """ - st=self.__class__() - st._data=self._data - st.decodedSelf=self.decodedSelf - st.update(self) - return self - def writeToStream(self, stream, encryption_key): self[NameObject("/Length")] = NumberObject(len(self._data)) DictionaryObject.writeToStream(self, stream, encryption_key) @@ -992,7 +906,7 @@ def initializeFromDictionary(data): return retval def flateEncode(self): - from .filters import FlateCodec + from pypdf.filters import FlateCodec if "/Filter" in self: f = self["/Filter"] @@ -1020,7 +934,7 @@ def __init__(self): self.decodedSelf = None def getData(self): - from .filters import decodeStreamData + from pypdf.filters import decodeStreamData if self.decodedSelf: # Cached version of decoded object @@ -2184,7 +2098,6 @@ def __init__(self, title, page, typ, *args): self[NameObject("/Title")] = title self[NameObject("/Page")] = page self[NameObject("/Type")] = typ - self.parent=None #PPzz # from table 8.2 of the PDF 1.7 reference. if typ == "/XYZ": @@ -2282,106 +2195,6 @@ def writeToStream(self, stream, encryption_key): :rtype: ``int``, or ``None`` if not available. """ -class PageLabel(): - def __init__(self,pn=0,defObject=None): - """ - :param - integer pn: 1st Page of the group - defObject: tuple (1stPage,prefix,increment) or DictionnaryObject from the file - """ - - if defObject is None: - defObject = DictionaryObject() - - try: - if type(defObject) != tuple: - self.prefix=defObject['/P'] - else: - self.prefix=defObject[1]+""#None will induce and error and reach default value - except: - self.prefix='' - - try: - if type(defObject) != tuple: - self.numbering=defObject['/S'] - else: - self.numbering=defObject[2]+""#None will induce and error and reach default value - except: - self.numbering='/D' if self.prefix == "" else "" - - self.pn=pn #1st page of the range - try: - if type(defObject) != tuple: - self.first=int(defObject['/St'])-pn - else: - self.first=max(1,int(defObject[0]))-pn #None will induce and error and reach default value - except: - self.first=1-pn - - def __repr__(self): - return "PageLabel Obj(@%r :%s-%s)" % (self.first, self.prefix, self.numbering) - - def buildDefinition(self,pn=None): - """ - build the DictionnaryObjecgt to inject into the PDF - """ - o=DictionaryObject() - if self.numbering!='/D' or self.prefix!='': - o.update({ NameObject("/S"):NameObject(self.numbering) }) - if self.prefix!='': - o.update({ NameObject("/P"):NameObject(self.prefix) }) - if pn==None: - o.update({ NameObject("/St"):NumberObject(self.first+self.pn) }) - elif pn==0: - pass; #No start value - else: - o.update({ NameObject("/St"):NumberObject(pn) }) - return o - - def getLabel(self,pn): - def int_to_Roman(num): - val = [ - 1000, 900, 500, 400, - 100, 90, 50, 40, - 10, 9, 5, 4, - 1 - ] - syb = [ - "M", "CM", "D", "CD", - "C", "XC", "L", "XL", - "X", "IX", "V", "IV", - "I" - ] - roman_num = '' - i = 0 - while num > 0: - for _ in range(num // val[i]): - roman_num += syb[i] - num -= val[i] - i += 1 - return roman_num - - def int_to_Alpha(num): - t="" - while(num>0): - num=num-1 - t=chr(num%26+65)+t - num=num//26 - return t - if self.numbering=='/D': - st=str(pn+self.first) - elif self.numbering=='/R': - st=int_to_Roman(pn+self.first) - elif self.numbering=='/r': - st=int_to_Roman(pn+self.first).lower() - elif self.numbering=='/A': - st=int_to_Alpha(pn+self.first) - elif self.numbering=='/a': - st=int_to_Alpha(pn+self.first).lower() - else: - st='' - return self.prefix+st - class Bookmark(Destination): def writeToStream(self, stream, encryption_key): diff --git a/pypdf/merger.py b/pypdf/merger.py index 39a325d73..8a029ea22 100644 --- a/pypdf/merger.py +++ b/pypdf/merger.py @@ -207,12 +207,9 @@ def append(self, fileobj, bookmark=None, pages=None, importBookmarks=True): """ self.merge(len(self._pages), fileobj, bookmark, pages, importBookmarks) - def write(self, fileobj=None): + def write(self): """ Writes all data that has been merged to the given output file. - - :param fileobj: Output file. Can be a filename or any kind of - file-like object. """ for page in self._pages: self._writer.addPage(page.pagedata) @@ -225,7 +222,7 @@ def write(self, fileobj=None): self._writeBookmarks() # Write the output to the file - self._writer.write(fileobj) + self._writer.write() def close(self): """ diff --git a/pypdf/pagerange.py b/pypdf/pagerange.py index cdf5e7acb..f8296d8de 100644 --- a/pypdf/pagerange.py +++ b/pypdf/pagerange.py @@ -8,7 +8,8 @@ """ import re -from .utils import isString + +from pypdf.utils import isString _INT_RE = r"(0|-?[1-9]\d*)" # A decimal int, don't allow "-0". PAGE_RANGE_RE = "^({int}|({int}?(:{int}?(:{int}?)?)))$".format(int=_INT_RE) diff --git a/pypdf/pdf.py b/pypdf/pdf.py index c52b10f3b..95a2260c2 100644 --- a/pypdf/pdf.py +++ b/pypdf/pdf.py @@ -40,17 +40,14 @@ import random import struct import time -import datetime -import sys import uuid from hashlib import md5 -from types import MethodType from sys import version_info -from . import utils -from .generic import * -from .utils import * -from .utils import pypdfBytes as b_ +from pypdf import utils +from pypdf.generic import * +from pypdf.utils import * +from pypdf.utils import pypdfBytes as b_ if version_info < (3, 0): from cStringIO import StringIO @@ -58,8 +55,6 @@ else: from io import StringIO, BytesIO -import warnings -import codecs __author__ = "Mathieu Fenniak" __author_email__ = "biziqe@mathieu.fenniak.net" @@ -68,7 +63,7 @@ class PdfFileWriter(object): - def __init__(self, stream=None, pdfReaderAsSource=None,debug=False): + def __init__(self, stream, debug=False): """ This class supports writing PDF files out, given pages produced by another class (typically :class:`PdfFileReader`). @@ -76,8 +71,6 @@ def __init__(self, stream=None, pdfReaderAsSource=None,debug=False): :param stream: File-like object or path to a PDF file in ``str`` format. If of the former type, the object must support the ``write()`` and the ``tell()`` methods. - :para pdfReaderAsSource: pdfFileReader object : - if passed, it is cloned into the writer :param bool debug: Whether this class should emit debug informations (recommended for development). Defaults to False. """ @@ -97,15 +90,6 @@ def __init__(self, stream=None, pdfReaderAsSource=None,debug=False): "written to correctly." % self._stream.name ) - # to be done before cloning alternative - self._flattenPageLabels=None - # copy compatible methods from Reader - self.getPageLabel=MethodType(PdfFileReader.getPageLabel,self) - - if isinstance(pdfReaderAsSource,PdfFileReader): - self.clone(pdfReaderAsSource) - return - # The root of our page tree node. pages = DictionaryObject() pages.update({ @@ -174,14 +158,6 @@ def isClosed(self): """ return not bool(self._stream) or self._stream.closed - def clone(self,pdfR): #ppZZ - self._IdTranslated={} - tr=pdfR._trailer.clone(self) - self._pages=tr['/Root'].rawGet('/Pages') - self._info=tr.rawGet('/Info') - self._rootObject=tr['/Root'] - self._root=tr.rawGet('/Root') - def _addObject(self, obj): self._objects.append(obj) @@ -193,29 +169,16 @@ def getObject(self, ido): return self._objects[ido.idnum - 1] - def _insertPage(self, page, pageNumber): - assert page["/Type"] == "/Page" - pn=self._pages.getObject()['/Count'] - if pageNumber>=pn: - nextPage,firstPageNum=self._getPage(pn-1,self._pages,0) - pageNumber=pn - else: - nextPage,firstPageNum=self._getPage(pageNumber,self._pages,0) - - nextPage=nextPage.getObject() - pages=nextPage['/Parent'] - pp=nextPage.rawGet('/Parent') - page[NameObject("/Parent")] = pp - pages["/Kids"].insert(pageNumber-firstPageNum,self._addObject(page)) - - while pp is not None: - pp1=pp.getObject() - pp1[NameObject("/Count")] = NumberObject(pp1["/Count"] + 1) - if pp==self._pages: - pp=None - else: - pp=pp1.rawGet("/Parent") - + def _addPage(self, page, action): + if page["/Type"] != "/Page": + raise ValueError("Page type is not /Page") + + page[NameObject("/Parent")] = self._pages + pages = self.getObject(self._pages) + action(pages["/Kids"], self._addObject(page)) + + pages[NameObject("/Count")] = NumberObject(pages["/Count"] + 1) + def addPage(self, page): """ Adds a page to this PDF file. The page is usually acquired from a @@ -224,7 +187,7 @@ def addPage(self, page): :param PageObject page: The page to add to the document. Should be an instance of :class:`PageObject` """ - self.insertPage(page,self._pages.getObject()["/Count"]) + self._addPage(page, list.append) def insertPage(self, page, index=0): """ @@ -235,88 +198,20 @@ def insertPage(self, page, index=0): should be an instance of :class:`PageObject`. :param int index: Position at which the page will be inserted. """ - assert 0<=index<=self._pages.getObject()["/Count"] - self._insertPage(page,index) - - def _getPage(self,pageNum,node,firstPageNum): #ppZZ - """ - internal - :param int pageNum: page searched for - :param IndirectObject node: point to a page or pages within the page tree - :param int firstPageNum: page number of the first page of the tree below - - """ - if node.getObject()['/Type'] == '/Page': # it is only one page we have to check - if pageNum == firstPageNum : - return node,-1 #One Page is not a group, we have to return -1 in order to return the 1st page number of the group - else: - return firstPageNum+1,-1 # return next first page number - elif firstPageNum <= pageNum ` """ - assert 0<=pageNumber to write to is not in binary mode. It may not be written to correctly." % stream.name) - - if stream is not None: - savedStream=self._stream - self._stream=stream - try: - self.write() - finally: - self._stream=savedStream - return - if not self._root: self._root = self._addObject(self._rootObject) @@ -878,10 +748,6 @@ def getReference(self, obj): return ref - def getIndirectObject(self, idnum): #ppZZ - ref = IndirectObject(idnum, 0, self) - return ref - def getOutlineRoot(self): if '/Outlines' in self._rootObject: outline = self._rootObject['/Outlines'] @@ -897,64 +763,47 @@ def getOutlineRoot(self): return outline - - #Copied from Reader - def _buildDestination(self, title, array): - return Destination(title, array[0], array[1], *array[2:]) - - def getNamedDestinations(self, tree=None, retval=None): - """ - Retrieves the named destinations present in the document. - - :return: a dictionary which maps names to - :class:`Destinations`. - :rtype: dict - """ - if retval is None: - retval = {} - catalog = self._rootObject - - # get the name tree - if "/Dests" in catalog: - tree = catalog["/Dests"] - elif "/Names" in catalog: - names = catalog['/Names'] - if "/Dests" in names: - tree = names['/Dests'] - - if tree is None: - return retval + def getNamedDestRoot(self): + if '/Names' in self._rootObject and \ + isinstance(self._rootObject['/Names'], DictionaryObject): + names = self._rootObject['/Names'] + idnum = self._objects.index(names) + 1 + namesRef = IndirectObject(idnum, 0, self) - if "/Kids" in tree: - # recurse down the tree - for kid in tree["/Kids"]: - self.getNamedDestinations(kid.getObject(), retval) + assert namesRef.getObject() == names - elif "/Names" in tree: #ppZZ if => elif - names = tree["/Names"] - for i in range(0, len(names), 2): - key = names[i].getObject() - val = names[i+1].getObject() + if '/Dests' in names and \ + isinstance(names['/Dests'], DictionaryObject): + dests = names['/Dests'] + idnum = self._objects.index(dests) + 1 + destsRef = IndirectObject(idnum, 0, self) - if isinstance(val, DictionaryObject) and '/D' in val: - val = val['/D'] + assert destsRef.getObject() == dests - dest = self._buildDestination(key, val) - if dest is not None: - retval[key] = dest - else: # case where Dests is in root catalog - for k,v in tree.items(): - val=v.getObject() - if isinstance(val, DictionaryObject) and '/D' in val: - val = val['/D'] - dest = self._buildDestination(k,val) - if dest != None: - retval[k] = dest + if '/Names' in dests: + nd = dests['/Names'] + else: + nd = ArrayObject() + dests[NameObject('/Names')] = nd + else: + dests = DictionaryObject() + destsRef = self._addObject(dests) + names[NameObject('/Dests')] = destsRef + nd = ArrayObject() + dests[NameObject('/Names')] = nd - return retval + else: + names = DictionaryObject() + namesRef = self._addObject(names) + self._rootObject[NameObject('/Names')] = namesRef + dests = DictionaryObject() + destsRef = self._addObject(dests) + names[NameObject('/Dests')] = destsRef + nd = ArrayObject() + dests[NameObject('/Names')] = nd + return nd - #bookmarks are added in def addBookmarkDestination(self, dest, parent=None): destRef = self._addObject(dest) @@ -995,7 +844,8 @@ def addBookmarkDict(self, bookmark, parent=None): def addBookmark( self, title, pagenum, parent=None, color=None, bold=False, - italic=False, fit='/Fit', *args): + italic=False, fit='/Fit', *args + ): """ Add a bookmark to this PDF file. @@ -1010,7 +860,7 @@ def addBookmark( :param str fit: The fit of the destination page. See :meth:`addLink()` for details. """ - pageRef = self.getPage(pagenum,True) + pageRef = self.getObject(self._pages)['/Kids'][pagenum] action = DictionaryObject() zoomArgs = [] @@ -1059,303 +909,40 @@ def addBookmark( return bookmarkRef - def addNamedDestinationObject(self, dest,title=None): - def _getMinMaxKey(node,_min=True): - if "/Names" in node: - return node["/Names"][0 if _min else -2] - elif "/Kids" in node: - return _getMinMaxKey(node["/Kids"][0 if _min else -1].getObject(),_min) - else: - raise Exception("_getMinMaxKey abnormal") - - def _insertNamedDest(title,dest,node,force=0): - if "/Limits" in node: - mi,ma=node['/Limits'][0:2] - elif ("/Kids" in node and len(node["/Kids"]) == 0) : - raise Exception("Kids list empty ???") - elif ("/Names" in node and len(node["/Names"]) == 0): - title=TextStringObject(title) - node['/Names'].append(title) - node['/Names'].append(dest) - node.update({NameObject('/Limits'):ArrayObject([title,title])}) - return node['/Limits'] - else: #there is some data but no Limits(it should not exists - mi=_getMinMaxKey(node,True) - ma=_getMinMaxKey(node,False) - - if "/Names" in node: #it is a list of names - if titlenode['/Limits'][1]: - node['/Limits'][1]=title - return node['/Limits'] - else: - return None - elif "/Kids" in node: #need to process one level down - if force == 1: - lim=_insertNamedDest(title,dest,node['/Kids'][-1].getObject(),+1) - if '/Limits' not in node: node.update({ NameObject('/Limits') : ArrayObject([ mi, lim[1] ]) }) - node['/Limits'][1]=lim[1] - return node['/Limits'] - elif title=0 - - def removeAnnots(self,pageSet=None,links=False,comments=False,attachments=False,prints=False,_3D=False): - """ - Removes different annotations from this output. - """ - if pageSet is None: - pageSet=range(self.numPages) - - #if all are false, for compatibility, they should be all deleted - if not(links or comments or attachments or prints or _3D): - links=True - comments=True - attachments=True - prints=True - _3D=True - subTypes=[] - if links: - subTypes.extend(['/Link',]) - if comments: - subTypes.extend(['/Text','/FreeText','/Line','/Square','/Circle','/Polygon','/PolyLine',\ - '/Highlight','/Underline','/Squiggly','/StrikeOut','/Stamp','/Caret',\ - '/Ink','/Popup',]) - if attachments: - subTypes.extend(['/FileAttachment','/Sound','/Movie','/Widget','/Screen',]) - if prints: - subTypes.extend(['/PrinterMark','/TrapNet','/Watermark',]) - if _3D: - subTypes.extend(['/3D']) - - for i in pageSet: - page = self.getPage(i) - if "/Annots" in page: - ik=0 - while iknode['/Limits'][1]: - node['/Limits'][1]=pn - return node['/Limits'] - else: - return None - elif "/Kids" in node: #need to process one level down - if force == 1: - lim=_insertPageLabel(pn,pagelbl,node['/Kids'][-1].getObject(),+1) - if '/Limits' not in node: node.update({ NameObject('/Limits') : ArrayObject([ mi, lim[1] ]) }) - node['/Limits'][1]=lim[1] - return node['/Limits'] - elif pn elif + if "/Names" in tree: names = tree["/Names"] for i in range(0, len(names), 2): key = names[i].getObject() @@ -2304,12 +1671,6 @@ def getNamedDestinations(self, tree=None, retval=None): dest = self._buildDestination(key, val) if dest is not None: retval[key] = dest - else: # case where Dests is in root catalog - for k,v in tree.items(): - val=v.getObject() - dest = self._buildDestination(k,val) - if dest != None: - retval[k] = dest return retval @@ -2322,7 +1683,7 @@ def getOutlines(self, node=None, outlines=None): """ if outlines is None: outlines = [] - catalog = self._rootObject + catalog = self._trailer["/Root"] # get the outline dictionary and named destinations if "/Outlines" in catalog: @@ -2434,19 +1795,6 @@ def _buildOutline(self, node): outline[NameObject("/Title")] = title else: raise PdfReadError("Unexpected destination %r" % dest) - - #ppZZ : add parent - outline.parent=None - if "/Parent" in node: - p=node["/Parent"].getObject() - try: - if "/Type" in p and p["/Type"] == '/Outlines': - outline.parent=None - elif "/Title" in p and p["/Title"] != '': - outline.parent=node["/Parent"] - except: - pass - return outline pages = property( @@ -2460,51 +1808,6 @@ def _buildOutline(self, node): :meth:`getPage()` methods. """ - def getPageLabel(self,num): - def findPageLblEntry(num): - #there will be always 0 that will match... - k1=-.5 - for k in sorted(self._flattenPageLabels.keys()): - if k>num: break - k1=k - - if num!=k: - k=k1 - return self._flattenPageLabels[k].getLabel(num) - - def flattenPageLabel(node=None): - flat={} - """ - the default value we use this value in order to have a - default value that will be overriden by 0 if provided and - if we want to check that there is a definition for page 0 - """ - flat[-0.5]=PageLabel(0,(0,'','/D')) - if node is None: - p1=self._rootObject - if "/PageLabels" in p1: - node=p1["/PageLabels"] - else: - return flat - if '/Nums' in node: - node=node['/Nums'].getObject() - for i in range(len(node)//2): - o=PageLabel(node[2*i],node[2*i+1].getObject()) - flat[node[2*i]]=o - elif '/Kids' in node: - for k in node['/Kids']: - flat.update(flattenPageLabel(k.getObject())) - else: - raise Exception("issue processing PageLabels") - return flat - - if self._flattenPageLabels is None: - self._flattenPageLabels =flattenPageLabel() - assert 0 <= num < self.numPages,"Page Number out of range" - return findPageLblEntry(num) - - - @property def pageLayout(self): """ @@ -2544,7 +1847,7 @@ def _flatten(self, pages=None, inherit=None, indirectRef=None): inherit = dict() if pages is None: self._flattenedPages = [] - catalog = self._rootObject + catalog = self._trailer["/Root"].getObject() pages = catalog["/Pages"].getObject() t = "/Pages" @@ -3007,10 +2310,8 @@ def usedBefore(num, generation): if "/XRefStm" in newTrailer: startxref = newTrailer["/XRefStm"] - del self._trailer["/XRefStm"] #to ensure there will be no loops elif "/Prev" in newTrailer: startxref = newTrailer["/Prev"] - del self._trailer["/Prev"] #to ensure there will be no loops else: break elif x.isdigit(): # PDF 1.5+ Cross-Reference Stream @@ -3097,16 +2398,14 @@ def usedBefore(num, generation): 1, xrefstreamOffset, xrefstmGen ) - trailerKeys = ("/Root", "/Encrypt", "/Info", "/ID","/Prev") + trailerKeys = ("/Root", "/Encrypt", "/Info", "/ID") for key in trailerKeys: if key in xrefstream and key not in self._trailer: self._trailer[NameObject(key)] = xrefstream.rawGet(key) - #based on other software, the Previous Prev shall also be processed... - if "/Prev" in self._trailer: ##ppZZ : /Prev was collected/updated before - startxref = self._trailer["/Prev"] - del self._trailer["/Prev"] #to ensure there will be no loops + if "/Prev" in xrefstream: + startxref = xrefstream["/Prev"] else: break else: @@ -3362,12 +2661,6 @@ def _authenticateUserPassword(self, password): def isEncrypted(self): return "/Encrypt" in self._trailer - def getIndirectObject(self, idnum): #ppZZ - ref = IndirectObject(idnum, 0, self) - return ref - - - def _convertToInt(d, size): if size > 8: diff --git a/samplecode/MergingComments.py b/samplecode/MergingComments.py deleted file mode 100644 index 1c4072fa5..000000000 --- a/samplecode/MergingComments.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/python3 -""" - test/demo program that copy alll comments from multiples pdf into one command line: - PDFCommentsMerge [-d] [-o output.pdf] [input1.pdf] ... [inputN.pdf] - -d: open Excel output at the end of extraction - -o: prode the output Excel name/path ; if not present the file is created - in temp folder named "FullCommented **input1**.pdf" - if no parameters (mainly for idle test), the pdf filenames re asked for - empty to finish -""" -import sys -import os -import pypdf as PDF; - - -if sys.argv[0].upper().find("PYTHON.EXE")>=0: - del sys.argv[0] -del sys.argv[0] # to ignore called program - -displayOutput=('-d' in sys.argv) or ('idlelib.run' in sys.modules) -try: - del sys.argv[sys.argv.index('-d')] -except: - pass - - -if (len(sys.argv)==0) or (('-o' in sys.argv) and (len(sys.argv)<=2)) : - print(globals()['__doc__']) - while True: - t=input("pdf file to scan:") - if t=='':break - sys.argv.append(t) - -if '-o' in sys.argv: - i=sys.argv.index('-o') - outFile=sys.argv[i+1] - del sys.argv[i] - del sys.argv[i] -else: - tempFolder=os.environ['TEMP'].replace('\\','/') - if tempFolder[-1]!='/' : tempFolder+='/' - outFile=tempFolder+"FullCommented "+os.path.splitext(os.path.split(sys.argv[0])[-1])[0]+'.pdf' - -pdfO=PDF.PdfFileWriter(None,PDF.PdfFileReader(sys.argv[0])) -del sys.argv[0] - -pdfS=[] -for f in sys.argv: - pdfS.append(PDF.PdfFileReader(f)) - #check if decryption is required ; normally not required - if pdfS[-1].isEncrypted: pdfS[-1].decrypt('') - -#we assume that all the documents are commenting the same original document -for i in range(pdfO.numPages): - po=pdfO.getPage(i) - for pdfin in pdfS: - pdfO.addCommentsFromPage(i,pdfin.getPage(i)) - -pdfO.write(outFile) -if displayOutput: - os.startfile(outFile) diff --git a/samplecode/PDFComments2XL.py b/samplecode/PDFComments2XL.py deleted file mode 100644 index 805de539e..000000000 --- a/samplecode/PDFComments2XL.py +++ /dev/null @@ -1,165 +0,0 @@ -#!/usr/bin/python3 -""" - test/demo program tha extract comments from an pdf into a Excel - command line: - PDFComments2XL [-d] [-o output.xls] [input.pdf] - -d: open Excel output at the end of extraction - -o: prode the output Excel name/path ; if not present the file is created - in temp folder named "comments on **PDFfile**.xlsx" - if no parameters (mainly for idle test), the pdf filename is asked for -""" -from collections import OrderedDict -from datetime import datetime -import sys -import os -import pypdf as PDF; -from openpyxl import Workbook -from openpyxl.utils import get_column_letter - -import locale -locale.setlocale(locale.LC_ALL,locale.getdefaultlocale()[0]) - -def ListOutlines(pdfS,outl=None): - """ - provide as a list of the outlines as tuple Title,Page(0 based),Vertical position in % - """ - if outl is None: - lst=[('-',0,0),] - outl=pdfS.getOutlines() - else: - lst=[] - if isinstance(outl,list): - for k in outl: - lst+=ListOutlines(pdfS,k) - else: - try: - top=outl['/Top'] - except: - top=0 - try: - pp=pdfS.MyPages[outl.page.idnum] - lst.append((outl.title,pp[0],100.0*(1.0-float(top/pp[1])))) - except: - print("trouble with page idnum",outl.page.idnum) - return lst - -def ListAnnots(pdfS): - """ - provide as a list of the comments with the response saved in .irt_str field, the list is indexed with idnums - """ - lst=OrderedDict() - for pn in range(pdfS.numPages): - p=pdfS.getPage(pn) - try: - a=p.get('/Annots' ).getObject() - if not isinstance(a,list): a=[a] - for b in a: - o=b.getObject() - if o['/Subtype']=='/Text': - try: o['/P'] # le champs '/P' etant optionnel on le reconstruit... - except: - o.update({PDF.NameObject('/P'):p.indirectRef}) - o.irt={} - lst[b.idnum]=o - except: - pass - #copy the information into the original comment - for k,o in lst.items(): - if '/IRT' in o: - t=o['/Contents'] - if isinstance(t,bytes):t=t.replace(b'\r',b'\n').decode('unicode_escape') - lst[o.rawGet('/IRT').idnum].irt[o['/M']]=\ - '%s (%s):\n%s'%\ - (o['/T'],datetime.strptime(o['/M'][2:10],'%Y%m%d').strftime('%x'),t) - #concat all replied comments into one string to ease insertion later... - for o in lst.values(): - o.irt_str='\n'.join([o.irt[x] for x in sorted(o.irt.keys())]) - return lst - -def FindOutline(Outlines,pa,pe): - """ - provide the outline just above the position (of the comment) - """ - m=None - for o in Outlines: - if(o[1]=0: - del sys.argv[0] - -if len(sys.argv)==1: - print(globals()['__doc__']) - sys.argv.append(input("pdf file to scan:")) - -pdfS=PDF.PdfFileReader(sys.argv[-1]) - -if '-o' in sys.argv: - xlFile=sys.argv [sys.argv.index('-o')+1] -else: - tempFolder=os.environ['TEMP'].replace('\\','/') - if tempFolder[-1]!='/' : tempFolder+='/' - xlFile=tempFolder+"Comments on "+os.path.splitext(os.path.split(pdfS.filepath)[-1])[0]+'.xlsx' - -#prepare the destination workbook -wb = Workbook() -ws=wb.active -ws.append(('Page','Pos','Chapt','Originator','Comment','Answer')) -ws.column_dimensions[get_column_letter(0+1)].width=5 -ws.column_dimensions[get_column_letter(1+1)].width=5 -ws.column_dimensions[get_column_letter(2+1)].width=25 -ws.column_dimensions[get_column_letter(3+1)].width=15 -ws.column_dimensions[get_column_letter(4+1)].width=90 -ws.column_dimensions[get_column_letter(5+1)].width=90 - -#check if decryption is required -if pdfS.isEncrypted: pdfS.decrypt('') - -#MyPages will store the matching table page.idnum => pagenumer,page_height -pdfS.MyPages={} - -for i,p in enumerate(pdfS.pages): - pdfS.MyPages[p.indirectRef.idnum]=[i,p['/MediaBox'][3]] - -#extract the list of OutLines into MyOutlines -pdfS.MyOutlines=ListOutlines(pdfS) - -#extract the comments into MyAnnots -pdfS.MyAnnots=ListAnnots(pdfS) - - -#sort the comments in the order (Page, vertical position, date) -lst={} -for p in pdfS.MyAnnots.values(): - pp=pdfS.MyPages[p.rawGet("/P").idnum] - pc=100.0*(1.0-float(int(p['/Rect'][1])/pp[1])) - lst[(pp[0],pc,p['/M'])]=p - -#fill the xl sheet with the comments -for x in sorted(lst.keys()): - p=lst[x] - if '/IRT' in p: continue #the comments with IRT are already present in the original comment irt field, we can ignore this one - - #print(x[0],',',end='') - #print('%.0f %%'%pc,',',end='') - #print(FindOutline(pdfS.MyOutlines,x[0],x[1])[0],',',end='') - auth=p['/T'] - if isinstance(auth,bytes):auth=auth.decode('unicode_escape') - cont=p['/Contents'] - if isinstance(cont,bytes):cont=cont.replace(b'\r',b'\n').decode('unicode_escape') - #print(cont,',',end='') - if isinstance(p.irt_str,bytes):p.irt_str=p.irt_str.replace(b'\r',b'\n').decode('unicode_escape') - #print(p.irt_str) - - ws.append((pdfS.getPageLabel(x[0]) ,'%.0f %%'%pc,FindOutline(pdfS.MyOutlines,x[0],x[1])[0],auth,cont,p.irt_str)) - -#post insertion formating -for row in ws.iter_rows(): - for cell in row: - cell.alignment = cell.alignment.copy(wrapText=True,vertical='top') - -#save and open the file -wb.save(xlFile) -if ('-d' in sys.argv) or ('idlelib.run' in sys.modules): - os.startfile(xlFile)