Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Uses buffered input for inline images to speed up reading #390

Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 24 additions & 27 deletions PyPDF2/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2789,35 +2789,32 @@ def _readInlineImage(self, stream):
# left at beginning of ID
tmp = stream.read(3)
assert tmp[:2] == b_("ID")
#data = self._readImageData(stream)
data = self._readImagaDataFast(stream)
return {"settings": settings, "data": data}

def _readImagaDataFast(self, stream):
# We keep more than buffersize bytes in the buffer because the
# end of image sequence might overlap. So we search some data twice,
# but this is still far more effective than the old algorithm
buffersize = 1024 * 1024 # Extracting in megabyte chunks
buffertail = 256
regex = re.compile(b_("(.*?)(EI\\s+)Q\\s+"), re.DOTALL)
data = b_("")

buffer = stream.read(buffersize+buffertail)

while True:
# Read the inline image, while checking for EI (End Image) operator.
tok = stream.read(1)
if tok == b_("E"):
# Check for End Image
tok2 = stream.read(1)
if tok2 == b_("I"):
# Data can contain EI, so check for the Q operator.
tok3 = stream.read(1)
info = tok + tok2
# We need to find whitespace between EI and Q.
has_q_whitespace = False
while tok3 in utils.WHITESPACES:
has_q_whitespace = True
info += tok3
tok3 = stream.read(1)
if tok3 == b_("Q") and has_q_whitespace:
stream.seek(-1, 1)
break
else:
stream.seek(-1,1)
data += info
else:
stream.seek(-1, 1)
data += tok
else:
data += tok
return {"settings": settings, "data": data}
match = regex.match(buffer)
if match:
data += buffer[:len(match.group(1))]
stream.seek(-1 * (len(buffer) - len(match.group(1)) - len(match.group(2))), 1)
return data

if len(buffer) < buffersize + buffertail: # We already have exhausted the stream
raise utils.PdfReadError("Didn't find end of image marker!")
data += buffer[:buffersize]
buffer = buffer[buffersize:] + stream.read(buffersize)

def _getData(self):
newdata = BytesIO()
Expand Down