Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Make DictionaryObject.readFromStream check if any entry values refer to the null object #326

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
52 changes: 40 additions & 12 deletions PyPDF2/generic.py
Expand Up @@ -51,7 +51,7 @@
IndirectPattern = re.compile(b_(r"[+-]?(\d+)\s+(\d+)\s+R[^a-zA-Z]"))


def readObject(stream, pdf):
def readObject(stream, pdf, check_dict_values=True):
tok = stream.read(1)
stream.seek(-1, 1) # reset to start
idx = ObjectPrefix.find(tok)
Expand All @@ -63,7 +63,7 @@ def readObject(stream, pdf):
peek = stream.read(2)
stream.seek(-2, 1) # reset to start
if peek == b_('<<'):
return DictionaryObject.readFromStream(stream, pdf)
return DictionaryObject.readFromStream(stream, pdf, check_dict_values)
else:
return readHexStringFromStream(stream)
elif idx == 2:
Expand Down Expand Up @@ -104,8 +104,15 @@ def getObject(self):
"""Resolves indirect references."""
return self

def isNull(self):
    """Tell whether this object is, or refers to, the PDF null object.

    The base implementation always answers ``False``; object types that
    can be null override this method.
    """
    return False


class NullObject(PdfObject):
def isNull(self):
    """Always ``True``: this object is the null object itself."""
    return True

def writeToStream(self, stream, encryption_key):
    """Write the ``null`` keyword to *stream*.

    *encryption_key* is accepted for interface compatibility and is not
    used here.
    """
    null_keyword = b_("null")
    stream.write(null_keyword)

Expand Down Expand Up @@ -178,6 +185,9 @@ def __init__(self, idnum, generation, pdf):
def getObject(self):
    """Resolve this reference and return the object it finally points to.

    The owning PDF is asked for the referenced object, and that object's
    own ``getObject()`` is then called so chained references are
    followed as well.
    """
    referenced = self.pdf.getObject(self)
    return referenced.getObject()

def isNull(self):
    """Treat the reference as null when its target does not exist.

    The owning PDF's ``objectExists()`` decides whether the referenced
    object is known; the result is simply negated.
    """
    target_exists = self.pdf.objectExists(self)
    return not target_exists

def __repr__(self):
    """Return a debug string showing the object and generation numbers."""
    numbers = (self.idnum, self.generation)
    return "IndirectObject(%r, %r)" % numbers

Expand Down Expand Up @@ -552,7 +562,7 @@ def writeToStream(self, stream, encryption_key):
stream.write(b_("\n"))
stream.write(b_(">>"))

def readFromStream(stream, pdf):
def readFromStream(stream, pdf, check_dict_values=True):
debug = False
tmp = stream.read(2)
if tmp != b_("<<"):
Expand All @@ -579,17 +589,35 @@ def readFromStream(stream, pdf):
tok = readNonWhitespace(stream)
stream.seek(-1, 1)
value = readObject(stream, pdf)
if not data.get(key):
data[key] = value
elif pdf.strict:
# multiple definitions of key not permitted
raise utils.PdfReadError("Multiple definitions in dictionary at byte %s for key %s" \
% (utils.hexStr(stream.tell()), key))
else:
warnings.warn("Multiple definitions in dictionary at byte %s for key %s" \
% (utils.hexStr(stream.tell()), key), utils.PdfReadWarning)
data.setdefault(key, []).append((value, stream.tell()))

pos = stream.tell()

data_cleaned = {}
for key in data:
final_value = None
for value, offset in data[key]:
# Keys with null values (or indirect references to undefined objects)
# are treated as if they do not exist.
if check_dict_values and value.isNull():
if debug: print("Cleaning value {0} of key {1}.".format(value, key))
continue

if final_value is not None:
if pdf.strict:
# multiple definitions of key not permitted
raise utils.PdfReadError("Multiple definitions in dictionary at byte %s for key %s" \
% (utils.hexStr(offset), key))
else:
warnings.warn("Multiple definitions in dictionary at byte %s for key %s" \
% (utils.hexStr(offset), key), utils.PdfReadWarning)
else:
final_value = value

if final_value is not None:
data_cleaned[key] = final_value
data = data_cleaned

s = readNonWhitespace(stream)
if s == b_('s') and stream.read(5) == b_('tream'):
eol = stream.read(1)
Expand Down
27 changes: 26 additions & 1 deletion PyPDF2/pdf.py
Expand Up @@ -114,6 +114,11 @@ def _addObject(self, obj):
self._objects.append(obj)
return IndirectObject(len(self._objects), 0, self)

def objectExists(self, ido):
    """Report whether *ido* refers to an object stored in this instance.

    :param ido: indirect reference whose ``pdf`` attribute must be this
        instance and whose ``idnum`` is the object number to look up.
    :return: ``True`` when ``ido.idnum`` is a valid 1-based position in
        ``self._objects``, ``False`` otherwise.
    :rtype: bool
    :raises ValueError: if ``ido.pdf`` is not this instance.
    """
    if ido.pdf != self:
        raise ValueError("pdf must be self")
    # Object numbers handed out by _addObject start at 1.  Without the
    # lower bound, idnum <= 0 would be reported as existing (and a later
    # ``self._objects[idnum - 1]`` would silently wrap around).
    return 0 < ido.idnum <= len(self._objects)

def getObject(self, ido):
if ido.pdf != self:
raise ValueError("pdf must be self")
Expand Down Expand Up @@ -1645,6 +1650,24 @@ def _getObjectFromStream(self, indirectReference):
if self.strict: raise utils.PdfReadError("This is a fatal error in strict mode.")
return NullObject()

def objectExists(self, indirectReference):
    """Report whether *indirectReference* points at a known object.

    The reference counts as existing when any of these holds:

    * ``cacheGetIndirectObject`` returns something for it,
    * its generation is 0 and its number is listed in ``xref_objStm``,
    * ``xref`` has an entry for its generation and object number.

    :param indirectReference: reference providing ``idnum`` and
        ``generation`` attributes.
    :return: ``True`` if one of the lookups succeeds, else ``False``.
    :rtype: bool
    """
    generation = indirectReference.generation
    idnum = indirectReference.idnum

    # Identity test on purpose: a cached object with a custom
    # ``__eq__``/``__ne__`` must not be mistaken for a cache miss.
    if self.cacheGetIndirectObject(generation, idnum) is not None:
        # Object is cached. It probably exists.
        return True

    if generation == 0 and idnum in self.xref_objStm:
        # Object is inside an object stream.
        # Just assume the object exists. The PDF is erroneous otherwise.
        return True

    # Otherwise the object exists only if the xref table lists it.
    return generation in self.xref and idnum in self.xref[generation]

def getObject(self, indirectReference):
debug = False
if debug: print(("looking at:", indirectReference.idnum, indirectReference.generation))
Expand Down Expand Up @@ -1865,7 +1888,9 @@ def read(self, stream):
# PDF 1.5+ Cross-Reference Stream
stream.seek(-1, 1)
idnum, generation = self.readObjectHeader(stream)
xrefstream = readObject(stream, self)
# Disable checking values of the stream dict because indirect
# references cannot be checked before the xref stream is parsed.
xrefstream = readObject(stream, self, check_dict_values=False)
assert xrefstream["/Type"] == "/XRef"
self.cacheIndirectObject(generation, idnum, xrefstream)
streamData = BytesIO(b_(xrefstream.getData()))
Expand Down
Binary file added Resources/multipledefs.pdf
Binary file not shown.
6 changes: 6 additions & 0 deletions Tests/tests.py
Expand Up @@ -36,6 +36,12 @@ def test_PdfReaderFileLoad(self):
msg='PDF extracted text differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n'
% (pdftext, ipdf_p1_text.encode('utf-8', errors='ignore')))

def test_PdfReaderMultipleDefinitions(self):
    # Smoke test: opening 'multipledefs.pdf' and fetching its first page
    # must not raise.  Presumably the file contains dictionaries with
    # repeated keys -- confirm against the resource.
    pdf_path = os.path.join(RESOURCE_ROOT, 'multipledefs.pdf')
    with open(pdf_path, 'rb') as inputfile:
        reader = PdfFileReader(inputfile)
        reader.getPage(0)


class AddJsTestCase(unittest.TestCase):

Expand Down