Skip to content

How to Extract Fonts from a PDF

dothinking edited this page Apr 16, 2021 · 4 revisions

This script extracts every font referenced by any page of a PDF and writes each extractable font to its own file in the current directory.

from __future__ import print_function
import fitz
# Extract every embedded font of a PDF into its own file in the current
# working directory, then print a short summary.
#
# NOTE(review): getFontList / extractFont are the camelCase PyMuPDF names this
# script was written against; newer PyMuPDF releases appear to spell them
# page.get_fonts() / doc.extract_font() -- confirm against the installed version.

# Open the PDF
doc = fitz.open("some.pdf")

# A font may be referenced by many pages; remember the xrefs already handled
# so each font is written only once. A set keeps the membership test cheap.
xref_visited = set()

num = 0  # count the extracted fonts
for page in doc:
    for font in page.getFontList():  # list of fonts of page
        xref = font[0]  # xref of font
        if xref in xref_visited:
            continue  # skip if already processed
        xref_visited.add(xref)  # do not process a second time

        # extract font buffer
        basename, ext, _, buffer = doc.extractFont(xref)
        if ext == "n/a":  # font is not extractable
            continue

        num += 1
        foutname = "%s-%i.%s" % (basename, xref, ext)  # build the filename
        # The context manager closes the file even if the write fails.
        with open(foutname, "wb") as fout:
            fout.write(buffer)
        print("extracted", foutname)

footer = "extracted %i font files from %s." % (num, doc.name)
footer_line = "-" * len(footer)  # rule as wide as the summary text

# output some protocol
print(footer_line)
print(footer)
print(footer_line)

# Close the PDF
doc.close()
Clone this wiki locally