Skip to content

Commit

Permalink
ROB: Fix infinite loop due to Invalid object (#1331)
Browse files Browse the repository at this point in the history
Fixes #1329

* Prevent loop within dictionaries caused by objects not respecting the PDF standard
* Fix cmap warnings caused by "numbered" (hex-escaped) characters in names, e.g. `#2d` instead of `-`
* Apply unnumbering to NameObject
* Add _get_indirect_object for debugging and development
* Add some missing seeks (no issue reported yet)
  • Loading branch information
pubpub-zz committed Sep 9, 2022
1 parent 2f77698 commit e6531a2
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 17 deletions.
3 changes: 2 additions & 1 deletion PyPDF2/_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from ._codecs import adobe_glyphs, charset_encoding
from ._utils import logger_warning
from .errors import PdfReadWarning
from .generic import DecodedStreamObject, DictionaryObject
from .generic import DecodedStreamObject, DictionaryObject, NameObject


# code freely inspired by @twiggy; see #711
Expand Down Expand Up @@ -124,6 +124,7 @@ def parse_encoding(
enc: Union(str, DictionaryObject) = ft["/Encoding"].get_object() # type: ignore
if isinstance(enc, str):
try:
enc = NameObject.unnumber(enc) # for #xx decoding
if enc in charset_encoding:
encoding = charset_encoding[enc].copy()
elif enc in _predefined_cmap:
Expand Down
9 changes: 9 additions & 0 deletions PyPDF2/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -1139,6 +1139,7 @@ def get_object(self, indirect_reference: IndirectObject) -> Optional[PdfObject]:
buf = bytes(self.stream.getbuffer()) # type: ignore
else:
p = self.stream.tell()
self.stream.seek(0, 0)
buf = self.stream.read(-1)
self.stream.seek(p, 0)
m = re.search(
Expand Down Expand Up @@ -1192,6 +1193,7 @@ def get_object(self, indirect_reference: IndirectObject) -> Optional[PdfObject]:
buf = bytes(self.stream.getbuffer()) # type: ignore
else:
p = self.stream.tell()
self.stream.seek(0, 0)
buf = self.stream.read(-1)
self.stream.seek(p, 0)
m = re.search(
Expand Down Expand Up @@ -1883,6 +1885,13 @@ def xfa(self) -> Optional[Dict[str, Any]]:
retval[tag] = es
return retval

def _get_indirect_object(self, num: int, gen: int) -> Optional[PdfObject]:
    """
    Development/debugging helper: resolve an indirect object by its ids.

    Builds ``IndirectObject(num, gen, self)`` against this reader and
    returns whatever its ``get_object()`` call yields.

    :param num: first argument forwarded to ``IndirectObject``.
    :param gen: second argument forwarded to ``IndirectObject``.
    :return: the result of ``get_object()`` on the constructed reference.
    """
    reference = IndirectObject(num, gen, self)
    return reference.get_object()


class PdfFileReader(PdfReader): # pragma: no cover
def __init__(self, *args: Any, **kwargs: Any) -> None:
Expand Down
13 changes: 11 additions & 2 deletions PyPDF2/generic/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,14 @@ def writeToStream(
deprecate_with_replacement("writeToStream", "write_to_stream")
self.write_to_stream(stream, encryption_key)

@staticmethod
def unnumber(sin: str) -> str:
i = sin.find("#")
while i >= 0:
sin = sin[:i] + chr(int(sin[i + 1 : i + 3], 16)) + sin[i + 3 :]
i = sin.find("#")
return sin

@staticmethod
def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader
name = stream.read(1)
Expand All @@ -431,10 +439,11 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader
ret = name.decode("utf-8")
except (UnicodeEncodeError, UnicodeDecodeError):
ret = name.decode("gbk")
return NameObject(ret)
except (UnicodeEncodeError, UnicodeDecodeError) as e:
# Name objects should represent irregular characters
# with a '#' followed by the symbol's hex number
ret = NameObject.unnumber(ret)
return NameObject(ret)
except (UnicodeEncodeError, UnicodeDecodeError) as e:
if not pdf.strict:
logger_warning("Illegal character in Name Object", __name__)
return NameObject(name)
Expand Down
39 changes: 25 additions & 14 deletions PyPDF2/generic/_data_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@
from ._utils import read_hex_string_from_stream, read_string_from_stream

logger = logging.getLogger(__name__)
ObjectPrefix = b"/<[tf(n%"
NumberSigns = b"+-"
IndirectPattern = re.compile(rb"[+-]?(\d+)\s+(\d+)\s+R[^a-zA-Z]")

Expand Down Expand Up @@ -263,10 +262,19 @@ def read_unsized_from_steam(stream: StreamType, pdf: Any) -> bytes: # PdfReader
stream.read(1)
break
stream.seek(-1, 1)
key = read_object(stream, pdf)
tok = read_non_whitespace(stream)
stream.seek(-1, 1)
value = read_object(stream, pdf, forced_encoding)
try:
key = read_object(stream, pdf)
tok = read_non_whitespace(stream)
stream.seek(-1, 1)
value = read_object(stream, pdf, forced_encoding)
except Exception as exc:
if pdf is not None and pdf.strict:
raise PdfReadError(exc.__repr__())
logger_warning(exc.__repr__(), __name__)
retval = DictionaryObject()
retval.update(data)
return retval # return partial data

if not data.get(key):
data[key] = value
else:
Expand Down Expand Up @@ -812,10 +820,9 @@ def read_object(
) -> Union[PdfObject, int, str, ContentStream]:
tok = stream.read(1)
stream.seek(-1, 1) # reset to start
idx = ObjectPrefix.find(tok)
if idx == 0:
if tok == b"/":
return NameObject.read_from_stream(stream, pdf)
elif idx == 1:
elif tok == b"<":
# hexadecimal string OR dictionary
peek = stream.read(2)
stream.seek(-2, 1) # reset to start
Expand All @@ -824,15 +831,15 @@ def read_object(
return DictionaryObject.read_from_stream(stream, pdf, forced_encoding)
else:
return read_hex_string_from_stream(stream, forced_encoding)
elif idx == 2:
elif tok == b"[":
return ArrayObject.read_from_stream(stream, pdf, forced_encoding)
elif idx == 3 or idx == 4:
elif tok == b"t" or tok == b"f":
return BooleanObject.read_from_stream(stream)
elif idx == 5:
elif tok == b"(":
return read_string_from_stream(stream, forced_encoding)
elif idx == 6:
elif tok == b"n":
return NullObject.read_from_stream(stream)
elif idx == 7:
elif tok == b"%":
# comment
while tok not in (b"\r", b"\n"):
tok = stream.read(1)
Expand All @@ -843,14 +850,18 @@ def read_object(
tok = read_non_whitespace(stream)
stream.seek(-1, 1)
return read_object(stream, pdf, forced_encoding)
else:
elif tok in b"0123456789+-.":
# number object OR indirect reference
peek = stream.read(20)
stream.seek(-len(peek), 1) # reset to start
if IndirectPattern.match(peek) is not None:
return IndirectObject.read_from_stream(stream, pdf)
else:
return NumberObject.read_from_stream(stream)
else:
raise PdfReadError(
f"Invalid Elementary Object starting with {tok} @{stream.tell()}" # type: ignore
)


class Field(TreeObject):
Expand Down
11 changes: 11 additions & 0 deletions tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,17 @@ def test_NameObject():
with pytest.raises(PdfReadError) as exc:
NameObject.read_from_stream(stream, None)
assert exc.value.args[0] == "name read error"
assert (
NameObject.read_from_stream(
BytesIO(b"/A;Name_With-Various***Characters?"), None
)
== "/A;Name_With-Various***Characters?"
)
assert (
NameObject.read_from_stream(BytesIO(b"/paired#28#29parentheses"), None)
== "/paired()parentheses"
)
assert NameObject.read_from_stream(BytesIO(b"/A#42"), None) == "/AB"


def test_destination_fit_r():
Expand Down

0 comments on commit e6531a2

Please sign in to comment.