Avoid pyarrow.fs import for local storage (#14321)
This is not a resolution, but may help mitigate problems from aws/aws-sdk-cpp#2681
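The change itself is mechanical: each module-level import of a pyarrow submodule that can transitively initialize the AWS SDK is moved into the function that actually needs it. A minimal sketch of the deferred-import pattern applied throughout (the function below is illustrative, not cudf's API):

    # Before: a module-level import loads pyarrow's ORC machinery
    # (and, transitively, its filesystem layer) for every user,
    # even for purely local workloads.
    # from pyarrow import orc


    def read_orc_schema(path):
        # After: the import (and any SDK initialization it triggers)
        # happens only when this code path actually runs.
        from pyarrow import orc

        return orc.ORCFile(path).schema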

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Peter Andreas Entschev (https://github.com/pentschev)
  - Lawrence Mitchell (https://github.com/wence-)
  - Bradley Dice (https://github.com/bdice)

URL: #14321
rjzamora committed Oct 24, 2023
commit 19d791c (1 parent: b390bca)
Showing 4 changed files with 36 additions and 4 deletions.
python/cudf/cudf/io/orc.py (4 additions & 1 deletion)

@@ -5,7 +5,6 @@
 
 import pyarrow as pa
 from fsspec.utils import stringify_path
-from pyarrow import orc as orc
 
 import cudf
 from cudf._lib import orc as liborc
@@ -17,6 +16,8 @@
 
 
 def _make_empty_df(filepath_or_buffer, columns):
+    from pyarrow import orc
+
     orc_file = orc.ORCFile(filepath_or_buffer)
     schema = orc_file.schema
     col_names = schema.names if columns is None else columns
@@ -150,6 +151,7 @@ def _parse_column_statistics(cs, column_statistics_blob):
 @ioutils.doc_read_orc_metadata()
 def read_orc_metadata(path):
     """{docstring}"""
+    from pyarrow import orc
 
     orc_file = orc.ORCFile(path)
 
@@ -380,6 +382,7 @@ def read_orc(
             )
         )
     else:
+        from pyarrow import orc
 
         def read_orc_stripe(orc_file, stripe, columns):
            pa_table = orc_file.read_stripe(stripe, columns)
python/cudf/cudf/io/parquet.py (9 additions & 2 deletions)

@@ -15,7 +15,7 @@
 
 import numpy as np
 import pandas as pd
-from pyarrow import dataset as ds, parquet as pq
+from pyarrow import dataset as ds
 
 import cudf
 from cudf._lib import parquet as libparquet
@@ -266,6 +266,7 @@ def write_to_dataset(
 @_cudf_nvtx_annotate
 def read_parquet_metadata(path):
     """{docstring}"""
+    import pyarrow.parquet as pq
 
     pq_file = pq.ParquetFile(path)
 
@@ -303,7 +304,9 @@ def _process_dataset(
 
     # Convert filters to ds.Expression
     if filters is not None:
-        filters = pq.filters_to_expression(filters)
+        from pyarrow.parquet import filters_to_expression
+
+        filters = filters_to_expression(filters)
 
     # Initialize ds.FilesystemDataset
     # TODO: Remove the if len(paths) workaround after following bug is fixed:
@@ -825,6 +828,8 @@ def _read_parquet(
             use_pandas_metadata=use_pandas_metadata,
         )
     else:
+        import pyarrow.parquet as pq
+
         return cudf.DataFrame.from_arrow(
             pq.ParquetDataset(filepaths_or_buffers).read_pandas(
                 columns=columns, *args, **kwargs
@@ -930,6 +935,8 @@ def to_parquet(
         )
 
     else:
+        import pyarrow.parquet as pq
+
         if partition_offsets is not None:
             warnings.warn(
                 "partition_offsets will be ignored when engine is not cudf"
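For reference, `filters_to_expression` converts the DNF-style filter list accepted by `read_parquet` into a `pyarrow.dataset.Expression`; importing it inside `_process_dataset` keeps `pyarrow.parquet` off cudf's import path. A minimal usage sketch (the directory and filter values are illustrative):

    import pyarrow.dataset as ds
    from pyarrow.parquet import filters_to_expression

    # A flat list of (column, op, value) tuples is ANDed together.
    expr = filters_to_expression([("year", "=", 2023), ("value", ">", 10)])

    dataset = ds.dataset("data/", format="parquet")  # hypothetical local path
    table = dataset.to_table(filter=expr)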
python/cudf/cudf/tests/test_s3.py (14 additions & 0 deletions)

@@ -533,3 +533,17 @@ def test_write_chunked_parquet(s3_base, s3so):
         actual.sort_values(["b"]).reset_index(drop=True),
         cudf.concat([df1, df2]).sort_values(["b"]).reset_index(drop=True),
     )
+
+
+def test_no_s3fs_on_cudf_import():
+    import subprocess
+    import sys
+
+    output = subprocess.check_output(
+        [
+            sys.executable,
+            "-c",
+            "import cudf; import sys; print('pyarrow._s3fs' in sys.modules)",
+        ]
+    )
+    assert output.strip() == b"False"
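The subprocess indirection is the important detail here: the pytest process itself may already have `pyarrow._s3fs` in `sys.modules` from earlier S3 tests, so only a fresh interpreter can show what a bare `import cudf` actually pulls in.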
python/cudf/cudf/utils/ioutils.py (9 additions & 1 deletion)

@@ -13,7 +13,6 @@
 import pandas as pd
 from fsspec.core import get_fs_token_paths
 from pyarrow import PythonFile as ArrowPythonFile
-from pyarrow.fs import FSSpecHandler, PyFileSystem
 from pyarrow.lib import NativeFile
 
 from cudf.utils.docutils import docfmt_partial
@@ -1630,6 +1629,15 @@ def _open_remote_files(
             for path, rgs in zip(paths, row_groups)
         ]
 
+    # Avoid top-level pyarrow.fs import.
+    # Importing pyarrow.fs initializes a S3 SDK with a finalizer
+    # that runs atexit. In some circumstances it appears this
+    # runs a call into a logging system that is already shutdown.
+    # To avoid this, we only import this subsystem if it is
+    # really needed.
+    # See https://github.com/aws/aws-sdk-cpp/issues/2681
+    from pyarrow.fs import FSSpecHandler, PyFileSystem
+
     # Default open - Use pyarrow filesystem API
     pa_fs = PyFileSystem(FSSpecHandler(fs))
     return [
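The `PyFileSystem(FSSpecHandler(fs))` pairing is pyarrow's supported way to present an fsspec filesystem through its `pyarrow.fs.FileSystem` interface, so the deferred import changes nothing about behavior, only when the module loads. A minimal sketch, assuming fsspec is installed and using a hypothetical local file:

    import fsspec
    from pyarrow.fs import FSSpecHandler, PyFileSystem

    # Wrap any fsspec-compatible filesystem (local, s3fs, gcsfs, ...) so
    # pyarrow readers can consume it through the pyarrow.fs API.
    fs = fsspec.filesystem("file")
    pa_fs = PyFileSystem(FSSpecHandler(fs))

    with pa_fs.open_input_file("/tmp/example.parquet") as f:  # hypothetical path
        magic = f.read(4)  # parquet files start with the magic bytes b"PAR1"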
