diff --git a/python/ray/data/read_api.py b/python/ray/data/read_api.py index dfe80cba480d5..66708b2f42beb 100644 --- a/python/ray/data/read_api.py +++ b/python/ray/data/read_api.py @@ -1028,6 +1028,21 @@ def read_json( >>> ds.take(1) [{'order_number': 10107, 'quantity': 30, 'year': '2022', 'month': '09'}] + When reading large files, the default block size configured in PyArrow can be too small, + resulting in the following error: + ``pyarrow.lib.ArrowInvalid: straddling object straddles two block boundaries + (try to increase block size?)``. + + To resolve this, use the ``read_options`` parameter to set a larger block size: + + >>> import pyarrow.json as pajson + >>> block_size = 10 << 20 # Set block size to 10 MiB + >>> ray.data.read_json( + ... "s3://anonymous@ray-example-data/log.json", + ... read_options=pajson.ReadOptions(block_size=block_size) + ... ) + Dataset(num_blocks=8, num_rows=1, schema={timestamp: timestamp[s], size: int64}) + Args: paths: A single file or directory, or a list of file or directory paths. A list of paths can contain both files and directories.