Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Added command update-offsets to adjust offsets and lengths. #15

Merged
merged 22 commits into from
Nov 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
b18c8e0
ENH: Added command update-offsets to adjust offsets and lengths.
srogmann Aug 28, 2022
25f0ccd
BUG: Clear stream-length at new object.
srogmann May 24, 2024
d8f6669
DEV: Logging migrated from Python's built-in logging to rich.console.
srogmann Nov 3, 2024
37b9b9d
TST: Added test of update-offsets using hello.pdf.
srogmann Nov 3, 2024
be39e9b
MAINT: Regex uppercase module constants.
srogmann Nov 3, 2024
2465ffe
DOC: Add update-offsets command
srogmann Nov 3, 2024
8838ca5
MAINT: Added suggested help-attribute.
srogmann Nov 3, 2024
3429c2f
Minor fixups & adding test_update_offsets_on_all_reference_files()
Nov 4, 2024
10ce504
MAINT: Bugfix help-attribute of x2pdf
srogmann Nov 4, 2024
9b0138a
ENH: Support of referenced lengths.
srogmann Nov 4, 2024
e0a32ff
TST: Renamed test PDF file..
srogmann Nov 5, 2024
4f003e5
TST: Renamed test PDF file.
srogmann Nov 5, 2024
47b16d4
TST: rich.console introduces line-breaks in output.
srogmann Nov 5, 2024
dd1be3b
MAINT: Changed /Length detection to support GeoTopo-komprimiert.pdf
srogmann Nov 5, 2024
9146897
MAINT: Changed /Length detection to support output_with_metadata_pymu…
srogmann Nov 5, 2024
6d72f5a
MAINT: Changed /Length detection (PDF ref 3.1 white-space characters)
srogmann Nov 5, 2024
657955b
MAINT: Don't replace pseudo line-breaks in binary parts of a pdf file.
srogmann Nov 5, 2024
5c3b92c
MAINT: EOL can be CR, LF or CRLF.
srogmann Nov 6, 2024
68a352f
TST: Disabled some documents which are not supported.
srogmann Nov 6, 2024
51ed725
MAINT: black (code formatting)
srogmann Nov 6, 2024
c3a6c88
DEV: directory tests is lower-case.
srogmann Nov 6, 2024
fc42eb4
Pleasing mypy & typing imports under Python 3.8
Nov 7, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ upload:
clean:
python setup.py clean --all
pyclean .
rm -rf Tests/__pycache__ pypdf/__pycache__ Image9.png htmlcov docs/_build dist dont_commit_merged.pdf dont_commit_writer.pdf pypdf.egg-info pypdf_pdfLocation.txt
rm -rf tests/__pycache__ pypdf/__pycache__ Image9.png htmlcov docs/_build dist dont_commit_merged.pdf dont_commit_writer.pdf pypdf.egg-info pypdf_pdfLocation.txt

test:
pytest Tests --cov --cov-report term-missing -vv --cov-report html --durations=3 --timeout=30
pytest tests --cov --cov-report term-missing -vv --cov-report html --durations=3 --timeout=30

mutation-test:
mutmut run
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ $ pdfly --help
│ meta Show metadata of a PDF file │
│ pagemeta Give details about a single page. │
│ rm Remove pages from PDF files. │
│ update-offsets Updates offsets and lengths in a simple PDF file. │
│ x2pdf Convert one or more files to PDF. Each file is a page. │
╰─────────────────────────────────────────────────────────────────────────────╯
```
Expand Down
16 changes: 16 additions & 0 deletions pdfly/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import pdfly.pagemeta
import pdfly.rm
import pdfly.up2
import pdfly.update_offsets
import pdfly.x2pdf


Expand Down Expand Up @@ -228,6 +229,21 @@ def compress(
pdfly.compress.main(pdf, output)


@entry_point.command(name="update-offsets", help=pdfly.update_offsets.__doc__)  # type: ignore[misc]
def update_offsets(
    file_in: Path,
    file_out: Path,
    encoding: str = typer.Option(
        "ISO-8859-1",
        help="Encoding used to read and write the files, e.g. UTF-8.",
    ),
    verbose: bool = typer.Option(
        False,
        help="Show progress while processing.",
    ),
) -> None:
    # CLI glue only: the command's help text is the module docstring of
    # pdfly.update_offsets, and all work is delegated to its main().
    pdfly.update_offsets.main(
        file_in=file_in,
        file_out=file_out,
        encoding=encoding,
        verbose=verbose,
    )


@entry_point.command(name="x2pdf", help=pdfly.x2pdf.__doc__) # type: ignore[misc]
def x2pdf(
x: List[Path],
Expand Down
291 changes: 291 additions & 0 deletions pdfly/update_offsets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,291 @@
"""
Updates offsets and lengths in a simple PDF file.

The PDF specification requires that the xref section at the end
of a PDF file has the correct offsets of the PDF's objects.
It further requires that the dictionary of a stream object
contains a /Length-entry giving the length of the encoded stream.

When editing a PDF file using a text-editor (e.g. vim) it is
elaborate to compute or adjust these offsets and lengths.

This command tries to compute /Length-entries of the stream dictionaries
and the offsets in the xref-section automatically.

It expects that the PDF file has ASCII encoding only. It may
use ISO-8859-1 or UTF-8 in its comments.
The current implementation incorrectly replaces CR (0x0d) by LF (0x0a) in binary data.
It expects that there is one xref-section only.
It expects that the /Length-entries have default values containing
enough digits, e.g. /Length 000 when the stream consists of 576 bytes.

Example:
update-offsets --verbose --encoding ISO-8859-1 issue-297.pdf issue-297.out.pdf

"""

import re
import sys
from pathlib import Path

if sys.version_info >= (3, 9):
List = list
else: # Support for Python 3.8
from typing import List

from rich.console import Console

# Here, only simple regular expressions are used.
# Beyond a certain level of complexity, switching to a proper PDF dictionary parser would be better.

# Start of an indirect object at the beginning of a line:
# "<object-number> <generation> obj". Groups: object-number, generation.
RE_OBJ = re.compile(r"^([0-9]+) ([0-9]+) obj *")
# Leading part of a line up to (excluding) the first CR or LF, i.e. the
# line without its line-break. The pattern also matches the empty string.
RE_CONTENT = re.compile(r"^([^\r\n]*)", re.DOTALL)
# /Length given as an indirect reference: "/Length <object-number> <generation> R".
# Groups: text up to and including "/Length ", object-number, generation, rest.
RE_LENGTH_REF = re.compile(r"^(.*/Length )([0-9]+) ([0-9]+) R(.*)", re.DOTALL)
# /Length given as a direct number. The digits must be followed by one of
# space, "/", ">", NUL, TAB, FF, CR or LF.
# Groups: text up to and including "/Length ", the digits, rest of the line.
RE_LENGTH = re.compile(
    r"^(.*/Length )([0-9]+)([ />\x00\t\f\r\n].*)", re.DOTALL
)


def update_lines(
    lines_in: List[str], encoding: str, console: Console, verbose: bool
) -> List[str]:
    """
    Iterates over the lines of a pdf-file and updates offsets and lengths.

    The input is expected to be a pdf whose structural keywords ("N 0 obj",
    "stream", "endstream", "endobj", "xref", "startxref") each stand on a
    line of their own.

    The work is done in two passes:

    1. All lines are walked once. The output offset of every object is
       recorded, the number of bytes between "stream" and "endstream" is
       measured, the place of each object's /Length is remembered, and the
       xref entries as well as the offset after "startxref" are rewritten.
    2. The remembered /Length values (direct numbers or the referenced
       length-objects) are overwritten with the measured stream lengths.
       The new value is zero-padded to the width of the old one, so the
       line keeps its byte length and the offsets of pass 1 stay valid.

    :param lines_in: A list over the lines including line-breaks.
    :param encoding: The encoding, e.g. "iso-8859-1" or "UTF-8".
    :param console: Console used to print messages.
    :param verbose: True to activate logging of info-messages.
    :return The output is a list of lines to be written
        in the given encoding.
    :raises RuntimeError: if the file has no objects, no xref-section or no
        startxref, uses a generation other than 0, contains an endstream
        without object or stream, or a /Length placeholder is missing or
        has too few digits.
    :raises NotImplementedError: if startxref occurs without a preceding
        xref-section.
    """
    lines_out = []  # lines to be written
    map_obj_offset = {}  # map from object-number to offset
    map_obj_line = {}  # map from object-number to line-number
    offset_out = 0  # current offset in output-file
    line_xref = None  # line-number of xref-line (in xref-section only)
    line_startxref = None  # line-number of startxref-line
    curr_obj = None  # number of current object
    len_stream = None  # length of stream (in stream only)
    offset_xref = None  # offset of xref-section
    map_stream_len = {}  # map from object-number to /Length of stream
    map_obj_length_line = {}  # map from object-number to /Length-line
    map_obj_length_ref = (
        {}
    )  # map from object-number to /Length-reference (e.g. "3")
    map_obj_length_line_no = {}  # map from object-number to line_no
    # of /Length-line

    # Pass 1: collect offsets and stream lengths, rewrite the xref data.
    for idx, line in enumerate(lines_in):
        line_no = idx + 1  # line-numbers are 1-based
        m_content = RE_CONTENT.match(line)
        if m_content is None:
            # RE_CONTENT also matches the empty string, so this guard is
            # purely defensive.
            raise RuntimeError(
                f"Invalid PDF file: line {line_no} without line-break."
            )
        content = m_content.group(1)

        m_obj = RE_OBJ.match(line)
        if m_obj is not None:
            curr_obj = m_obj.group(1)
            curr_gen = m_obj.group(2)
            if verbose:
                console.print(f"line {line_no}: object {curr_obj}")
            if curr_gen != "0":
                raise RuntimeError(
                    f"Invalid PDF file: generation {curr_gen} of object {curr_obj} in line {line_no} is not supported."
                )
            map_obj_offset[curr_obj] = offset_out
            map_obj_line[curr_obj] = line_no
            len_stream = None  # a new object starts outside of any stream

        if content == "xref":
            offset_xref = offset_out
            line_xref = line_no
        elif content == "startxref":
            line_startxref = line_no
            line_xref = None  # the xref-section has ended
        elif content == "stream":
            if verbose:
                console.print(f"line {line_no}: start stream")
            len_stream = 0
        elif content == "endstream":
            if verbose:
                console.print(f"line {line_no}: end stream")
            if curr_obj is None:
                raise RuntimeError(
                    f"Invalid PDF file: line {line_no}: endstream without object-start."
                )
            if len_stream is None:
                raise RuntimeError(
                    f"Invalid PDF file: line {line_no}: endstream without stream."
                )
            if len_stream > 0:
                # Ignore the last EOL: the line-break in front of
                # "endstream" is not counted as stream data.
                len_stream -= 2 if lines_in[idx - 1][-2:] == "\r\n" else 1
            if verbose:
                console.print(
                    f"line {line_no}: Computed /Length {len_stream} of obj {curr_obj}"
                )
            map_stream_len[curr_obj] = len_stream
        elif content == "endobj":
            curr_obj = None
        elif curr_obj is not None and len_stream is None:
            # Inside an object but before its stream: look for /Length.
            m_length_ref = RE_LENGTH_REF.match(line)
            if m_length_ref is not None:
                len_obj = m_length_ref.group(2)
                len_obj_gen = m_length_ref.group(3)
                if verbose:
                    console.print(
                        f"line {line_no}, /Length-reference {len_obj} {len_obj_gen} R: {content}"
                    )
                map_obj_length_ref[curr_obj] = len_obj
            else:
                m_length = RE_LENGTH.match(line)
                if m_length is not None:
                    if verbose:
                        console.print(f"line {line_no}, /Length: {content}")
                    map_obj_length_line[curr_obj] = line
                    map_obj_length_line_no[curr_obj] = line_no
        elif curr_obj is not None and len_stream is not None:
            # Inside a stream: every line counts with its encoded size.
            len_stream += len(line.encode(encoding))
        elif line_xref is not None and line_no > line_xref + 2:
            # The arithmetic assumes a single xref-subsection starting at
            # object 0: header at line_xref + 1, entry of object 0 at
            # line_xref + 2, entry of object n at line_xref + 2 + n.
            obj_no = line_no - line_xref - 2
            if obj_no <= len(map_obj_offset) and str(obj_no) in map_obj_offset:
                eol = line[-2:]  # keep the two trailing characters as they are
                xref_upd = f"{map_obj_offset[str(obj_no)]:010d} 00000 n"
                if verbose:
                    console.print(f"{content} -> {xref_upd}")
                line = xref_upd + eol
        elif line_startxref is not None and line_no == line_startxref + 1:
            if offset_xref is None:
                raise NotImplementedError(
                    "Unsupported file: startxref without preceding xref-section (probable cross-reference stream)"
                )
            # Keep the line-break of the replaced line (CR, LF or CRLF), as
            # the xref entries do; fall back to LF if the line had none.
            eol = line[len(content):] or "\n"
            line = f"{offset_xref}{eol}"
        lines_out.append(line)

        offset_out += len(line.encode(encoding))

    # Some checks
    if len(map_obj_offset) == 0:
        raise RuntimeError(
            "Invalid PDF file: the command didn't find any PDF objects."
        )
    if offset_xref is None:
        raise RuntimeError(
            "Invalid PDF file: the command didn't find a xref-section"
        )
    if line_startxref is None:
        raise RuntimeError(
            "Invalid PDF file: the command didn't find a startxref-section"
        )

    # Pass 2: write the measured stream lengths into the /Length entries.
    for curr_obj, stream_len in map_stream_len.items():
        if curr_obj in map_obj_length_line:
            # Direct "/Length <digits>" inside the object's dictionary.
            line = map_obj_length_line[curr_obj]
            m_length = RE_LENGTH.match(line)
            if m_length is None:
                raise RuntimeError(
                    f"Invalid PDF file: line '{line}' does not contain a valid /Length."
                )
            prev_length = m_length.group(2)
            len_digits = len(prev_length)
            updated_length = str(stream_len).zfill(len_digits)
            if len(updated_length) > len_digits:
                raise RuntimeError(
                    f"Not enough digits in /Length-entry {prev_length}"
                    f" of object {curr_obj}:"
                    f" too short to take /Length {updated_length}"
                )
            line = m_length.group(1) + updated_length + m_length.group(3)
            # line-numbers are 1-based, lines_out is 0-based
            lines_out[map_obj_length_line_no[curr_obj] - 1] = line
        elif curr_obj in map_obj_length_ref:
            # "/Length <n> 0 R": the value is the line following "<n> 0 obj".
            len_obj = map_obj_length_ref[curr_obj]
            if len_obj not in map_obj_line:
                raise RuntimeError(
                    f"obj {curr_obj} has unknown length-obj {len_obj}"
                )
            # The 1-based line-number of "<n> 0 obj" is the 0-based index
            # of the line after it.
            len_obj_line = map_obj_line[len_obj]
            # NOTE: exactly one trailing character is treated as line-break
            # and the value is written back with LF; for a CRLF line the CR
            # therefore counts towards the available width.
            prev_length = lines_out[len_obj_line][:-1]
            len_digits = len(prev_length)
            updated_length = str(stream_len).zfill(len_digits)
            if len(updated_length) > len_digits:
                raise RuntimeError(
                    f"Not enough digits in /Length-ref-entry {prev_length}"
                    f" of object {curr_obj} and len-object {len_obj}:"
                    f" too short to take /Length {updated_length}"
                )
            if prev_length != updated_length:
                if verbose:
                    # Report the line that is patched, not the last line
                    # seen by the first pass.
                    console.print(
                        f"line {len_obj_line + 1}, ref-len {len_obj} of {curr_obj}: {prev_length} -> {updated_length}"
                    )
                lines_out[len_obj_line] = updated_length + "\n"
        else:
            raise RuntimeError(
                f"obj {curr_obj} with stream-len {stream_len}"
                f" has no object-length-line: {map_obj_length_line}"
            )

    return lines_out


def read_binary_file(file_path: Path, encoding: str) -> List[str]:
    """
    Reads a binary file and returns its lines as a list of strings in the given encoding.
    Encoding utf-8 can't be used to read random binary data.

    The raw bytes are split at the line-breaks LF, CR and CRLF before they
    are decoded. Each returned string keeps its line-break, and a final
    line without line-break is returned as it is. The split is done on the
    complete content, so a CRLF is always kept together as one line-break.

    :param file_path: file to be read line by line
    :param encoding: encoding to be used (e.g. "iso-8859-1")
    :return lines including line-breaks
    :raises UnicodeDecodeError: if a line can't be decoded with the encoding
    """
    data = file_path.read_bytes()
    # bytes.splitlines() breaks at LF, CR and CRLF only; other control
    # characters such as FF stay inside their line.
    return [
        raw_line.decode(encoding, errors="strict")
        for raw_line in data.splitlines(keepends=True)
    ]


def main(file_in: Path, file_out: Path, encoding: str, verbose: bool) -> None:
    """
    Reads a PDF file, updates its offsets and lengths and writes the result.

    :param file_in: PDF file to be read
    :param file_out: file the updated PDF is written to
    :param encoding: encoding used to decode the input and encode the output
    :param verbose: True to print progress messages while processing
    """
    console = Console()
    console.print(f"Read {file_in}")

    updated_lines = update_lines(
        read_binary_file(file_in, encoding), encoding, console, verbose
    )

    with open(file_out, "wb") as out_file:
        out_file.writelines(text.encode(encoding) for text in updated_lines)

    console.print(f"Wrote {file_out}", soft_wrap=True)
Binary file added resources/file-with-fixed-offsets.pdf
Binary file not shown.
Binary file added resources/file-with-invalid-offsets.pdf
Binary file not shown.
6 changes: 3 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Utilities and fixtures that are available automatically for all tests."""

import io, os
import os
from pathlib import Path

from fpdf import FPDF
Expand Down Expand Up @@ -58,7 +58,7 @@ def pdf_file_100(tmp_path):
for i in range(100):
pdf.add_page()
pdf.set_font("helvetica", size=12)
pdf.cell(200, 10, txt=f"{i}", ln=True, align="C")
pdf.cell(200, 10, text=f"{i}", ln=True, align="C")

pdf_filepath = tmp_path / "pdf_file_100.pdf"
pdf.output(pdf_filepath)
Expand All @@ -73,7 +73,7 @@ def pdf_file_abc(tmp_path):
for char in [chr(i) for i in range(ord("a"), ord("z") + 1)]:
pdf.add_page()
pdf.set_font("helvetica", size=12)
pdf.cell(200, 10, txt=f"{char}", ln=True, align="C")
pdf.cell(200, 10, text=f"{char}", ln=True, align="C")

pdf_filepath = tmp_path / "abc.pdf"
pdf.output(pdf_filepath)
Expand Down
Loading
Loading