Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add support for file_bytes argument with managed_file_context() #15

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 46 additions & 7 deletions camelot/handlers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# -*- coding: utf-8 -*-

from contextlib import contextmanager
import io
import os
import sys

Expand All @@ -8,12 +10,13 @@
from .core import TableList
from .parsers import Stream, Lattice
from .utils import (
InvalidArguments,
TemporaryDirectory,
get_page_layout,
get_text_objects,
get_rotation,
is_url,
download_url,
get_url_bytes,
)


Expand All @@ -24,19 +27,33 @@ class PDFHandler(object):

Parameters
----------
filepath : str
Filepath or URL of the PDF file.
filepath : str | pathlib.Path, optional (default: None)
Filepath or URL of the PDF file. Required if file_bytes is not given
pages : str, optional (default: '1')
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'.
password : str, optional (default: None)
Password for decryption.
file_bytes : io.IOBase, optional (default: None)
A file-like stream. Required if filepath is not given

"""

def __init__(self, filepath, pages="1", password=None):
def __init__(self, filepath=None, pages="1", password=None, file_bytes=None):
if is_url(filepath):
filepath = download_url(filepath)
file_bytes = get_url_bytes(filepath)

if not filepath and not file_bytes:
raise InvalidArguments('Either `filepath` or `file_bytes` is required')
if not filepath:
# filepath must either be passed, or taken from the name attribute
filepath = getattr(file_bytes, 'name')
if not filepath:
msg = ('Either pass a `filepath`, or give the '
'`file_bytes` argument a name attribute')
raise InvalidArguments(msg)
self.file_bytes = file_bytes # ok to be None

self.filepath = filepath
if not filepath.lower().endswith(".pdf"):
raise NotImplementedError("File format not supported")
Expand All @@ -49,6 +66,28 @@ def __init__(self, filepath, pages="1", password=None):
self.password = self.password.encode("ascii")
self.pages = self._get_pages(pages)

@contextmanager
def managed_file_context(self):
"""Reads from either the `filepath` or `file_bytes`
attribute of this instance, to return a file-like object.
Closes any open file handles on exit or error.

Returns
-------
file_bytes : io.IOBase
A readable, seekable, file-like object
"""
if self.file_bytes:
# if we can't seek, write to a BytesIO object that can,
# then seek to the beginning before yielding
if not hasattr(self.file_bytes, 'seek'):
self.file_bytes = io.BytesIO(self.file_bytes.read())
self.file_bytes.seek(0)
yield self.file_bytes
else:
with open(self.filepath, "rb") as file_bytes:
yield file_bytes

def _get_pages(self, pages):
"""Converts pages string to list of ints.

Expand All @@ -71,7 +110,7 @@ def _get_pages(self, pages):
if pages == "1":
page_numbers.append({"start": 1, "end": 1})
else:
with open(self.filepath, "rb") as f:
with self.managed_file_context() as f:
infile = PdfReader(f, strict=False)

if infile.is_encrypted:
Expand Down Expand Up @@ -107,7 +146,7 @@ def _save_page(self, filepath, page, temp):
Tmp directory.

"""
with open(filepath, "rb") as fileobj:
with self.managed_file_context() as fileobj:
infile = PdfReader(fileobj, strict=False)
if infile.is_encrypted:
infile.decrypt(self.password)
Expand Down
20 changes: 15 additions & 5 deletions camelot/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,21 @@
import warnings

from .handlers import PDFHandler
from .utils import validate_input, remove_extra
from .utils import (
InvalidArguments,
validate_input,
remove_extra,
)


def read_pdf(
filepath,
filepath=None,
pages="1",
password=None,
flavor="lattice",
suppress_stdout=False,
layout_kwargs={},
file_bytes=None,
**kwargs
):
"""Read PDF and return extracted tables.
Expand All @@ -22,8 +27,8 @@ def read_pdf(

Parameters
----------
filepath : str
Filepath or URL of the PDF file.
filepath : str | pathlib.Path, optional (default: None)
Filepath or URL of the PDF file. Required if file_bytes is not given
pages : str, optional (default: '1')
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'.
Expand All @@ -34,6 +39,8 @@ def read_pdf(
Lattice is used by default.
suppress_stdout : bool, optional (default: True)
Print all logs and warnings.
file_bytes : io.IOBase, optional (default: None)
A file-like stream. Required if filepath is not given
layout_kwargs : dict, optional (default: {})
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
table_areas : list, optional (default: None)
Expand Down Expand Up @@ -103,12 +110,15 @@ def read_pdf(
"Unknown flavor specified." " Use either 'lattice' or 'stream'"
)

if not filepath and not file_bytes:
raise InvalidArguments('Either `filepath` or `file_bytes` is required')

with warnings.catch_warnings():
if suppress_stdout:
warnings.simplefilter("ignore")

validate_input(kwargs, flavor=flavor)
p = PDFHandler(filepath, pages=pages, password=password)
p = PDFHandler(filepath, pages=pages, password=password, file_bytes=file_bytes)
kwargs = remove_extra(kwargs, flavor=flavor)
tables = p.parse(
flavor=flavor,
Expand Down
37 changes: 20 additions & 17 deletions camelot/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-

import os
import io
import re
import random
import shutil
Expand Down Expand Up @@ -36,6 +36,10 @@
_VALID_URLS.discard("")


class InvalidArguments(Exception):
pass


# https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
def is_url(url):
"""Check to see if a URL has a valid protocol.
Expand Down Expand Up @@ -66,31 +70,30 @@ def random_string(length):
return ret


def download_url(url):
"""Download file from specified URL.
def get_url_bytes(url):
"""Get a stream of bytes for url

Parameters
----------
url : str or unicode

Returns
-------
filepath : str or unicode
Temporary filepath.
file_bytes : io.BytesIO
a file-like object that cane be read

"""
filename = f"{random_string(6)}.pdf"
with tempfile.NamedTemporaryFile("wb", delete=False) as f:
headers = {"User-Agent": "Mozilla/5.0"}
request = Request(url, None, headers)
obj = urlopen(request)
content_type = obj.info().get_content_type()
if content_type != "application/pdf":
raise NotImplementedError("File format not supported")
f.write(obj.read())
filepath = os.path.join(os.path.dirname(f.name), filename)
shutil.move(f.name, filepath)
return filepath
file_bytes = io.BytesIO()
file_bytes.name = url
headers = {"User-Agent": "Mozilla/5.0"}
request = Request(url, data=None, headers=headers)
obj = urlopen(request)
content_type = obj.info().get_content_type()
if content_type != "application/pdf":
raise NotImplementedError("File format not supported")
file_bytes.write(obj.read())
file_bytes.seek(0)
return file_bytes


stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"]
Expand Down
18 changes: 18 additions & 0 deletions tests/test_common.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-

import io
import os
import sys

Expand Down Expand Up @@ -172,3 +173,20 @@ def test_handler_pages_generator():

handler = PDFHandler(filename)
assert handler._get_pages("1,2,5-10") == [1, 2, 5, 6, 7, 8, 9, 10]


def test_from_open():
filename = os.path.join(testdir, "foo.pdf")
with open(filename, "rb") as file_bytes:
tables = camelot.read_pdf(file_bytes=file_bytes)
assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>"

def test_from_bytes():
filename = os.path.join(testdir, "foo.pdf")
file_bytes = io.BytesIO()
with open(filename, "rb") as f:
file_bytes.write(f.read()) # note that we didn't seek, done by PDFHandler
tables = camelot.read_pdf(file_bytes=file_bytes)
assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>"