-
Notifications
You must be signed in to change notification settings - Fork 5.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Data] Auto increasing block size for read_json #42357
Changes from 11 commits
4857e95
f6902fe
ebe1cfc
afeac34
360dc40
4c34794
da74990
59a3e36
f5b387a
b5a6ffc
838933b
0177858
76dead9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -1,11 +1,15 @@ | ||||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union | ||||||
|
||||||
from ray.data._internal.dataset_logger import DatasetLogger | ||||||
from ray.data.context import DataContext | ||||||
from ray.data.datasource.file_based_datasource import FileBasedDatasource | ||||||
from ray.util.annotations import PublicAPI | ||||||
|
||||||
if TYPE_CHECKING: | ||||||
import pyarrow | ||||||
|
||||||
logger = DatasetLogger(__name__) | ||||||
|
||||||
|
||||||
@PublicAPI | ||||||
class JSONDatasource(FileBasedDatasource): | ||||||
|
@@ -34,6 +38,54 @@ def __init__( | |||||
|
||||||
# TODO(ekl) The PyArrow JSON reader doesn't support streaming reads. | ||||||
def _read_stream(self, f: "pyarrow.NativeFile", path: str): | ||||||
from pyarrow import json | ||||||
from io import BytesIO | ||||||
|
||||||
from pyarrow import ArrowInvalid, json | ||||||
|
||||||
# When reading large files, the default block size configured in PyArrow can be | ||||||
# too small, resulting in the following error: `pyarrow.lib.ArrowInvalid: | ||||||
# straddling object straddles two block boundaries (try to increase block | ||||||
# size?)`. The read will be retried with geometrically increasing block size | ||||||
# until the size reaches `DataContext.get_current().target_max_block_size`. | ||||||
# The initial block size will start at the PyArrow default block size or it can | ||||||
# be manually set through the `read_options` parameter as follows. | ||||||
# | ||||||
# >>> import pyarrow.json as pajson | ||||||
# >>> block_size = 10 << 20 # Set block size to 10MB | ||||||
# >>> ray.data.read_json( # doctest: +SKIP | ||||||
# ... "s3://anonymous@ray-example-data/log.json", | ||||||
# ... read_options=pajson.ReadOptions(block_size=block_size) | ||||||
# ... ) | ||||||
|
||||||
yield json.read_json(f, read_options=self.read_options, **self.arrow_json_args) | ||||||
buffer = f.read_buffer() | ||||||
init_block_size = self.read_options.block_size | ||||||
max_block_size = DataContext.get_current().target_max_block_size | ||||||
while True: | ||||||
try: | ||||||
yield json.read_json( | ||||||
BytesIO(buffer), | ||||||
read_options=self.read_options, | ||||||
**self.arrow_json_args, | ||||||
) | ||||||
self.read_options.block_size = init_block_size | ||||||
break | ||||||
except ArrowInvalid as e: | ||||||
if isinstance(e, ArrowInvalid) and "straddling" in str(e): | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We don't need this check inside the `except` block, right? Let's also compare against a longer string, in case there are other error messages that use the word "straddling":
Suggested change
|
||||||
if self.read_options.block_size < max_block_size: | ||||||
# Increase the block size in case it was too small. | ||||||
logger.get_logger(log_to_stdout=False).info( | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's use `True` for this since it is important, so that the user always sees this message in the stdout logs. |
||||||
f"JSONDatasource read failed with " | ||||||
f"block_size={self.read_options.block_size}. Retrying with " | ||||||
f"block_size={self.read_options.block_size * 2}." | ||||||
) | ||||||
self.read_options.block_size *= 2 | ||||||
else: | ||||||
raise ArrowInvalid( | ||||||
f"{e} - Auto-increasing block size to " | ||||||
f"{self.read_options.block_size}B failed. " | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nit:
Suggested change
|
||||||
f"More information on this issue can be found here: " | ||||||
f"https://github.com/apache/arrow/issues/25674" | ||||||
) | ||||||
else: | ||||||
# unrelated error, simply reraise | ||||||
raise e |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: can we also include the arrow issue link here?