From f88a1b022de5822261abec4e7dd9fa0e457a4ee7 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 7 Sep 2022 23:59:19 +0200 Subject: [PATCH 1/2] ROB : fix infinite loop due to Invalid object fixes #1329 *prevent loop within dictionaries whose objects do not respect the standard *fix cmap warnings due to "numbered" characters ( #2d instead of -) *apply unnumbering to nameobject *add _get_indirect_object for debug/dev purpose *add some missing seeks (no issue reported yet) --- PyPDF2/_cmap.py | 3 ++- PyPDF2/_reader.py | 9 +++++++ PyPDF2/generic/_base.py | 13 ++++++++-- PyPDF2/generic/_data_structures.py | 39 +++++++++++++++++++----------- tests/test_generic.py | 11 +++++++++ 5 files changed, 58 insertions(+), 17 deletions(-) diff --git a/PyPDF2/_cmap.py b/PyPDF2/_cmap.py index 58f644599..1165f7d0b 100644 --- a/PyPDF2/_cmap.py +++ b/PyPDF2/_cmap.py @@ -5,7 +5,7 @@ from ._codecs import adobe_glyphs, charset_encoding from ._utils import logger_warning from .errors import PdfReadWarning -from .generic import DecodedStreamObject, DictionaryObject +from .generic import DecodedStreamObject, DictionaryObject, NameObject # code freely inspired from @twiggy ; see #711 @@ -124,6 +124,7 @@ def parse_encoding( enc: Union(str, DictionaryObject) = ft["/Encoding"].get_object() # type: ignore if isinstance(enc, str): try: + enc = NameObject.unnumber(enc) # for #xx decoding if enc in charset_encoding: encoding = charset_encoding[enc].copy() elif enc in _predefined_cmap: diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index 7707adf36..48aa3c80b 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -1139,6 +1139,7 @@ def get_object(self, indirect_reference: IndirectObject) -> Optional[PdfObject]: buf = bytes(self.stream.getbuffer()) # type: ignore else: p = self.stream.tell() + p.seek(0, 0) buf = self.stream.read(-1) self.stream.seek(p, 0) m = re.search( @@ -1192,6 +1193,7 @@ def get_object(self, indirect_reference: IndirectObject) -> 
Optional[PdfObject]: buf = bytes(self.stream.getbuffer()) # type: ignore else: p = self.stream.tell() + self.stream.seek(0, 0) buf = self.stream.read(-1) self.stream.seek(p, 0) m = re.search( @@ -1883,6 +1885,13 @@ def xfa(self) -> Optional[Dict[str, Any]]: retval[tag] = es return retval + def _get_indirect_object(self, num: int, gen: int) -> PdfObject: + """ + used to ease development + equivalent to generic.IndirectObject(num,gen,self).get_object() + """ + return IndirectObject(num, gen, self).get_object() + class PdfFileReader(PdfReader): # pragma: no cover def __init__(self, *args: Any, **kwargs: Any) -> None: diff --git a/PyPDF2/generic/_base.py b/PyPDF2/generic/_base.py index c3a2b1f8a..b968ef2bb 100644 --- a/PyPDF2/generic/_base.py +++ b/PyPDF2/generic/_base.py @@ -420,6 +420,14 @@ def writeToStream( deprecate_with_replacement("writeToStream", "write_to_stream") self.write_to_stream(stream, encryption_key) + @staticmethod + def unnumber(sin: str) -> str: + i = sin.find("#") + while i >= 0: + sin = sin[:i] + chr(int(sin[i + 1 : i + 3], 16)) + sin[i + 3 :] + i = sin.find("#") + return sin + @staticmethod def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader name = stream.read(1) @@ -431,10 +439,11 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader ret = name.decode("utf-8") except (UnicodeEncodeError, UnicodeDecodeError): ret = name.decode("gbk") - return NameObject(ret) - except (UnicodeEncodeError, UnicodeDecodeError) as e: # Name objects should represent irregular characters # with a '#' followed by the symbol's hex number + ret = NameObject.unnumber(ret) + return NameObject(ret) + except (UnicodeEncodeError, UnicodeDecodeError) as e: if not pdf.strict: logger_warning("Illegal character in Name Object", __name__) return NameObject(name) diff --git a/PyPDF2/generic/_data_structures.py b/PyPDF2/generic/_data_structures.py index 283b33b22..03579dec3 100644 --- a/PyPDF2/generic/_data_structures.py +++ 
b/PyPDF2/generic/_data_structures.py @@ -67,7 +67,6 @@ from ._utils import read_hex_string_from_stream, read_string_from_stream logger = logging.getLogger(__name__) -ObjectPrefix = b"/<[tf(n%" NumberSigns = b"+-" IndirectPattern = re.compile(rb"[+-]?(\d+)\s+(\d+)\s+R[^a-zA-Z]") @@ -263,10 +262,19 @@ def read_unsized_from_steam(stream: StreamType, pdf: Any) -> bytes: # PdfReader stream.read(1) break stream.seek(-1, 1) - key = read_object(stream, pdf) - tok = read_non_whitespace(stream) - stream.seek(-1, 1) - value = read_object(stream, pdf, forced_encoding) + try: + key = read_object(stream, pdf) + tok = read_non_whitespace(stream) + stream.seek(-1, 1) + value = read_object(stream, pdf, forced_encoding) + except Exception as e: + if pdf is not None and pdf.strict: + raise PdfReadError(e.__repr__()) + logger_warning(e.__repr__(), __name__) + retval = DictionaryObject() + retval.update(data) + return retval # return partial data + if not data.get(key): data[key] = value else: @@ -812,10 +820,9 @@ def read_object( ) -> Union[PdfObject, int, str, ContentStream]: tok = stream.read(1) stream.seek(-1, 1) # reset to start - idx = ObjectPrefix.find(tok) - if idx == 0: + if tok == b"/": return NameObject.read_from_stream(stream, pdf) - elif idx == 1: + elif tok == b"<": # hexadecimal string OR dictionary peek = stream.read(2) stream.seek(-2, 1) # reset to start @@ -824,15 +831,15 @@ def read_object( return DictionaryObject.read_from_stream(stream, pdf, forced_encoding) else: return read_hex_string_from_stream(stream, forced_encoding) - elif idx == 2: + elif tok == b"[": return ArrayObject.read_from_stream(stream, pdf, forced_encoding) - elif idx == 3 or idx == 4: + elif tok == b"t" or tok == b"f": return BooleanObject.read_from_stream(stream) - elif idx == 5: + elif tok == b"(": return read_string_from_stream(stream, forced_encoding) - elif idx == 6: + elif tok == b"n": return NullObject.read_from_stream(stream) - elif idx == 7: + elif tok == b"%": # comment while tok not in 
(b"\r", b"\n"): tok = stream.read(1) @@ -843,7 +850,7 @@ def read_object( tok = read_non_whitespace(stream) stream.seek(-1, 1) return read_object(stream, pdf, forced_encoding) - else: + elif tok in b"0123456789+-.": # number object OR indirect reference peek = stream.read(20) stream.seek(-len(peek), 1) # reset to start @@ -851,6 +858,10 @@ def read_object( return IndirectObject.read_from_stream(stream, pdf) else: return NumberObject.read_from_stream(stream) + else: + raise PdfReadError( + f"Invalid Elementary Object starting with {tok} @{stream.tell()}" + ) class Field(TreeObject): diff --git a/tests/test_generic.py b/tests/test_generic.py index 4ca18308d..d0798c5f6 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -175,6 +175,17 @@ def test_NameObject(): with pytest.raises(PdfReadError) as exc: NameObject.read_from_stream(stream, None) assert exc.value.args[0] == "name read error" + assert ( + NameObject.read_from_stream( + BytesIO(b"/A;Name_With-Various***Characters?"), None + ) + == "/A;Name_With-Various***Characters?" 
+ ) + assert ( + NameObject.read_from_stream(BytesIO(b"/paired#28#29parentheses"), None) + == "/paired()parentheses" + ) + assert NameObject.read_from_stream(BytesIO(b"/A#42"), None) == "/AB" def test_destination_fit_r(): From d5e9547a09ede1652dff7b7d465d92612d18c1fd Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Thu, 8 Sep 2022 00:17:08 +0200 Subject: [PATCH 2/2] mypy --- PyPDF2/_reader.py | 4 ++-- PyPDF2/generic/_data_structures.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index 48aa3c80b..052d13cab 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -1139,7 +1139,7 @@ def get_object(self, indirect_reference: IndirectObject) -> Optional[PdfObject]: buf = bytes(self.stream.getbuffer()) # type: ignore else: p = self.stream.tell() - p.seek(0, 0) + self.stream.seek(0, 0) buf = self.stream.read(-1) self.stream.seek(p, 0) m = re.search( @@ -1885,7 +1885,7 @@ def xfa(self) -> Optional[Dict[str, Any]]: retval[tag] = es return retval - def _get_indirect_object(self, num: int, gen: int) -> PdfObject: + def _get_indirect_object(self, num: int, gen: int) -> Optional[PdfObject]: """ used to ease development equivalent to generic.IndirectObject(num,gen,self).get_object() diff --git a/PyPDF2/generic/_data_structures.py b/PyPDF2/generic/_data_structures.py index 03579dec3..64183608b 100644 --- a/PyPDF2/generic/_data_structures.py +++ b/PyPDF2/generic/_data_structures.py @@ -267,10 +267,10 @@ def read_unsized_from_steam(stream: StreamType, pdf: Any) -> bytes: # PdfReader tok = read_non_whitespace(stream) stream.seek(-1, 1) value = read_object(stream, pdf, forced_encoding) - except Exception as e: + except Exception as exc: if pdf is not None and pdf.strict: - raise PdfReadError(e.__repr__()) - logger_warning(e.__repr__(), __name__) + raise PdfReadError(exc.__repr__()) + logger_warning(exc.__repr__(), __name__) retval = DictionaryObject() retval.update(data) 
return retval # return partial data @@ -860,7 +860,7 @@ def read_object( return NumberObject.read_from_stream(stream) else: raise PdfReadError( - f"Invalid Elementary Object starting with {tok} @{stream.tell()}" + f"Invalid Elementary Object starting with {tok} @{stream.tell()}" # type: ignore )