'''A collection of convenience functions that simplify the use of
fitz / PyMuPDF.
'''
from __future__ import absolute_import
from fitz.fitz import *


#==============================================================================
# Private helpers shared by the public functions below.
#==============================================================================
def _external_link_dict(dest):
    '''Describe a URI, GOTOR, LAUNCH or NAMED link destination.

    Returns a dictionary with the key "type" plus the keys specific to that
    destination kind, or None for any other destination kind.
    '''
    kind = dest.kind
    if kind == fitz.LINK_URI:
        return {"type": "uri", "uri": dest.uri}
    if kind == fitz.LINK_GOTOR:
        return {"type": "gotor", "file": dest.fileSpec, "page": dest.page}
    if kind == fitz.LINK_LAUNCH:
        return {"type": "launch", "file": dest.fileSpec}
    if kind == fitz.LINK_NAMED:
        return {"type": "named", "name": dest.named}
    return None


def _outline_title(olItem):
    '''Return the title of an outline item as text.

    A missing or empty title yields a single blank. Byte strings are decoded
    as UTF-8; titles that are already text are returned unchanged (calling
    decode() on them would fail under Python 3).
    '''
    title = olItem.title
    if not title:
        return u" "
    if isinstance(title, bytes):
        return title.decode("utf-8")
    return title


def _first_outline(doc, caller):
    '''Validate doc and return its first outline item (may be falsy).

    caller is the public function name used in the error message.
    Raises UserWarning if doc has no truthy "authenticate" attribute and
    ValueError if doc has no "outline" attribute.
    '''
    if not getattr(doc, "authenticate", None):
        raise UserWarning("invalid document object provided to " + caller)
    if not hasattr(doc, "outline"):
        raise ValueError("document is still encrypted")
    return doc.outline


def _walk_outline(olItem, lvl, liste, make_entry):
    '''Append make_entry(lvl, item) to liste for olItem, its siblings and,
    recursively, all their children (depth first). Returns liste.
    '''
    while olItem:
        liste.append(make_entry(lvl, olItem))
        if olItem.down:
            # children are one hierarchy level deeper than their parent
            _walk_outline(olItem.down, lvl + 1, liste, make_entry)
        olItem = olItem.next
    return liste


#==============================================================================
# A generalized function for extracting a page's text.
# All that needs to be specified is a page object previously created by the
# loadPage() method of a document.
#==============================================================================
def GetText(page, output="text"):
    '''Extract the text of a document page.

    Parameters:
    page: page object from a previous loadPage().
    output: one of "text", "html", "json" or "xml" (case-insensitive).
        Any other value silently selects "text".

    Returns the result of the TextPage extraction method extractText,
    extractHTML, extractJSON or extractXML, respectively.

    Raises UserWarning if page has no truthy "parent" attribute.
    '''
    if not getattr(page, "parent", None):
        raise UserWarning("invalid page object provided to GetText")

    # Record the page into a display list first ...
    dl = fitz.DisplayList()
    dv = fitz.Device(dl)
    page.run(dv, fitz.Identity)

    # ... then replay the list into a text device that fills the text page.
    ts = fitz.TextSheet()
    tp = fitz.TextPage()
    rect = page.bound()
    dl.run(fitz.Device(ts, tp), fitz.Identity, rect)

    fmt = output.lower()
    if fmt == "json":
        return tp.extractJSON()
    if fmt == "html":
        return tp.extractHTML()
    if fmt == "xml":
        return tp.extractXML()
    return tp.extractText(basic=True)


#==============================================================================
# A generalized function for rendering a page's image.
# All that needs to be specified is a page object previously created by the
# loadPage() method of a document.
#==============================================================================
def GetPixmap(page, matrix=fitz.Identity, Colorspace="RGB"):
    '''Create a fitz.Pixmap object for a document page.

    Parameters:
    page: page object from a previous loadPage().
    matrix: a fitz.Matrix instance specifying the required transformation.
        Defaults to fitz.Identity (no transformation).
    Colorspace: "RGB", "CMYK" or "GRAY" (case-insensitive).
        Any other value silently selects "RGB".

    Returns the rendered fitz.Pixmap.

    Raises UserWarning if page has no truthy "parent" attribute.
    '''
    if not getattr(page, "parent", None):
        raise UserWarning("invalid page object provided to GetPixmap")

    # RGB is the documented fallback, so it must be the final "else" branch;
    # GRAY is only chosen when explicitly requested.
    cs_name = Colorspace.upper()
    if cs_name == "GRAY":
        cs = fitz.CS_GRAY
    elif cs_name == "CMYK":
        cs = fitz.CS_CMYK
    else:
        cs = fitz.CS_RGB

    rect = page.bound().transform(matrix)          # scale page boundaries
    irect = rect.round()                           # integer rectangle representing it
    pix = fitz.Pixmap(fitz.Colorspace(cs), irect)  # create an empty pixmap
    pix.clearWith(255)                             # clear it with color "white"
    dev = fitz.Device(pix)                         # create a "draw" device
    page.run(dev, matrix)                          # render the page
    return pix


#==============================================================================
# A function to collect all links of a PDF page.
# All that needs to be specified is a page object previously created by the
# loadPage() method of a document.
#==============================================================================
def GetLinks(page):
    '''Create a list of all links contained in a document page.

    Parameters:
    page: page object from a previous loadPage().

    Returns a list with one dictionary per link found. Every dictionary
    contains the key "type" ("uri", "goto", "gotor", "launch" or "named");
    the other keys depend on the link type - see PyMuPDF's documentation
    for details. Links of any other kind are skipped.

    Raises UserWarning if page has no truthy "parent" attribute.
    '''
    if not getattr(page, "parent", None):
        raise UserWarning("invalid page object provided to GetLinks")

    links = []
    ln = page.loadLinks()
    while ln:
        dest = ln.dest
        if dest.kind == fitz.LINK_GOTO:
            nl = {"type": "goto", "page": dest.page}
        else:
            nl = _external_link_dict(dest)
        # Unknown kinds produce no dictionary. Appending unconditionally
        # would either fail on an unbound name or repeat the previous link.
        if nl is not None:
            links.append(nl)
        ln = ln.next
    return links


#==============================================================================
# A function to collect all bookmarks of a PDF document in the form of a table
# of contents. It is very similar to Document.getToC (or fitz.GetToC(doc)).
# The difference is an additional entry for each outline, which specifies
# any link destination information. For details what these are, see PyMuPDF's
# documentation.
#==============================================================================
def GetExtendedToC(doc):
    '''Create an extended table of contents for a given document.

    Parameters:
    doc: a document object created with fitz.Document.

    Returns a list with one entry [level, title, page, link] per outline
    item, in depth-first order. level starts at 1. page is the 1-based
    page number for LINK_GOTO destinations and 0 otherwise. link is a
    dictionary with destination details for all other known destination
    kinds and an empty dictionary for LINK_GOTO or unknown kinds.
    An empty list is returned if the document has no outline.

    Raises UserWarning if doc has no truthy "authenticate" attribute and
    ValueError if doc has no "outline" attribute (still encrypted).
    '''
    def make_entry(lvl, olItem):
        dest = olItem.dest
        page = 0
        link = {}
        if dest.kind == fitz.LINK_GOTO:
            page = dest.page + 1            # destination pages are 0-based
        elif dest.kind == fitz.LINK_NONE:
            link = {"type": "none", "page": dest.page}
        else:
            link = _external_link_dict(dest) or {}
        return [lvl, _outline_title(olItem), page, link]

    olItem = _first_outline(doc, "GetExtendedToC")
    return _walk_outline(olItem, 1, [], make_entry)


#================================================================
# Function: Table of Contents
#================================================================
def GetToC(doc):
    '''Create a table of contents for a given document.

    Parameters:
    doc: a document object created with fitz.Document.

    Returns a list with one entry [level, title, page] per outline item,
    in depth-first order. level starts at 1. page is the 1-based page
    number for LINK_GOTO destinations and 0 otherwise.
    An empty list is returned if the document has no outline.

    Raises UserWarning if doc has no truthy "authenticate" attribute and
    ValueError if doc has no "outline" attribute (still encrypted).
    '''
    def make_entry(lvl, olItem):
        dest = olItem.dest
        # NOTE(review): replaces the former literal 1, assuming
        # fitz.LINK_GOTO == 1 as in GetExtendedToC - confirm.
        if dest.kind == fitz.LINK_GOTO:
            page = dest.page + 1            # destination pages are 0-based
        else:
            page = 0
        return [lvl, _outline_title(olItem), page]

    olItem = _first_outline(doc, "GetToC")
    return _walk_outline(olItem, 1, [], make_entry)