From 39528541e6bc63c85049194ed3db6016522b8ca7 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 2 Mar 2021 14:45:58 -0800 Subject: [PATCH 01/54] update pyarrow to 3.0.0 --- conda/environments/cudf_dev_cuda10.1.yml | 4 +-- conda/environments/cudf_dev_cuda10.2.yml | 4 +-- conda/environments/cudf_dev_cuda11.0.yml | 4 +-- conda/recipes/cudf/meta.yaml | 2 +- conda/recipes/libcudf/meta.yaml | 2 +- cpp/cmake/thirdparty/CUDF_GetArrow.cmake | 2 +- python/cudf/cudf/_lib/gpuarrow.pyx | 4 +-- python/cudf/cudf/_lib/transpose.pyx | 6 ++-- python/cudf/cudf/_lib/utils.pyx | 13 ++++---- python/cudf/cudf/core/column/categorical.py | 36 +++++++++------------ python/cudf/cudf/core/column/column.py | 16 ++++----- python/cudf/cudf/core/dataframe.py | 11 ++++--- python/cudf/cudf/core/frame.py | 7 ++-- python/cudf/cudf/core/index.py | 14 ++++---- python/cudf/cudf/io/csv.py | 2 +- python/cudf/cudf/testing/testing.py | 4 +-- python/cudf/cudf/tests/__init__.py | 0 python/cudf/cudf/tests/test_dataframe.py | 4 +-- 18 files changed, 63 insertions(+), 72 deletions(-) delete mode 100644 python/cudf/cudf/tests/__init__.py diff --git a/conda/environments/cudf_dev_cuda10.1.yml b/conda/environments/cudf_dev_cuda10.1.yml index 3541ed1208c..e9ae53f8789 100644 --- a/conda/environments/cudf_dev_cuda10.1.yml +++ b/conda/environments/cudf_dev_cuda10.1.yml @@ -18,7 +18,7 @@ dependencies: - numba>=0.49.0,!=0.51.0 - numpy - pandas>=1.0,<1.3.0dev0 - - pyarrow=1.0.1 + - pyarrow=3.0.0 - fastavro>=0.22.9 - notebook>=0.5.0 - cython>=0.29,<0.30 @@ -47,7 +47,7 @@ dependencies: - distributed>=2.22.0 - streamz - dlpack - - arrow-cpp=1.0.1 + - arrow-cpp=3.0.0 - arrow-cpp-proc * cuda - boost-cpp>=1.72.0 - double-conversion diff --git a/conda/environments/cudf_dev_cuda10.2.yml b/conda/environments/cudf_dev_cuda10.2.yml index 839533516fb..22e2d22378e 100644 --- a/conda/environments/cudf_dev_cuda10.2.yml +++ b/conda/environments/cudf_dev_cuda10.2.yml @@ -18,7 +18,7 @@ dependencies: - numba>=0.49,!=0.51.0 - numpy - pandas>=1.0,<1.3.0dev0 - - pyarrow=1.0.1 + - pyarrow=3.0.0 - fastavro>=0.22.9 - notebook>=0.5.0 - cython>=0.29,<0.30 @@ -47,7 +47,7 @@ dependencies: - distributed>=2.22.0 - streamz - dlpack - - arrow-cpp=1.0.1 + - arrow-cpp=3.0.0 - arrow-cpp-proc * cuda - boost-cpp>=1.72.0 - double-conversion diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index 401eaea63da..38ef346778c 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -18,7 +18,7 @@ dependencies: - numba>=0.49,!=0.51.0 - numpy - pandas>=1.0,<1.3.0dev0 - - pyarrow=1.0.1 + - pyarrow=3.0.0 - fastavro>=0.22.9 - notebook>=0.5.0 - cython>=0.29,<0.30 @@ -47,7 +47,7 @@ dependencies: - distributed>=2.22.0 - streamz - dlpack - - arrow-cpp=1.0.1 + - arrow-cpp=3.0.0 - arrow-cpp-proc * cuda - boost-cpp>=1.72.0 - double-conversion diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 5635f54ba20..f8ecb711d9b 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -27,7 +27,7 @@ requirements: - setuptools - numba >=0.49.0 - dlpack - - pyarrow 1.0.1 + - pyarrow 3.0.0 - libcudf {{ version }} - rmm {{ minor_version }} - cudatoolkit {{ cuda_version }} diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 885a22870bb..4490a3547e0 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -37,7 +37,7 @@ requirements: host: - librmm {{ minor_version }}.* - cudatoolkit {{ cuda_version }}.* - - arrow-cpp 1.0.1 + - arrow-cpp 3.0.0 - arrow-cpp-proc * cuda - boost-cpp 1.72.0 - dlpack diff --git a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake index 425a9af897d..f9ddbc80583 100644 --- a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake @@ -118,6 +118,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) endfunction() -set(CUDF_VERSION_Arrow 1.0.1) +set(CUDF_VERSION_Arrow 3.0.0) find_and_configure_arrow(${CUDF_VERSION_Arrow} ${ARROW_STATIC_LIB}) diff --git a/python/cudf/cudf/_lib/gpuarrow.pyx b/python/cudf/cudf/_lib/gpuarrow.pyx index 6513cd59424..a7da22637b9 100644 --- a/python/cudf/cudf/_lib/gpuarrow.pyx +++ b/python/cudf/cudf/_lib/gpuarrow.pyx @@ -15,7 +15,7 @@ from pyarrow.includes.libarrow cimport ( CRecordBatchStreamReader ) from pyarrow.lib cimport ( - _CRecordBatchReader, + RecordBatchReader, Buffer, Schema, pyarrow_wrap_schema @@ -23,7 +23,7 @@ from pyarrow.lib cimport ( import pyarrow as pa -cdef class CudaRecordBatchStreamReader(_CRecordBatchReader): +cdef class CudaRecordBatchStreamReader(RecordBatchReader): cdef: CIpcReadOptions options diff --git a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx index d2b053789cd..fd02a08c49e 100644 --- a/python/cudf/cudf/_lib/transpose.pyx +++ b/python/cudf/cudf/_lib/transpose.pyx @@ -36,11 +36,11 @@ def transpose(Table source): if is_categorical_dtype(dtype): if any(not is_categorical_dtype(c.dtype) for c in source._columns): raise ValueError('Columns must all have the same dtype') - cats = list(c.cat().categories for c in source._columns) + cats = list(c.cat.categories for c in source._columns) cats = cudf.Series(cudf.concat(cats)).drop_duplicates()._column source = Table(index=source._index, data=[ - (name, col.cat()._set_categories( - col.cat().categories, cats, is_unique=True).codes) + (name, col.cat._set_categories( + col.cat.categories, cats, is_unique=True).codes) for name, col in source._data.items() ]) elif dtype.kind in 'OU': diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 4c4ef17c6b9..26101c8bf7f 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -116,12 +116,13 @@ cpdef generate_pandas_metadata(Table table, index): index_descriptors.append(descr) metadata = pa.pandas_compat.construct_metadata( - table, - col_names, - index_levels, - index_descriptors, - index, - types, + columns_to_convert=table._data.columns, + df=table, + column_names=col_names, + index_levels=index_levels, + index_descriptors=index_descriptors, + preserve_index=index, + types=types, ) md_dict = json.loads(metadata[b"pandas"]) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index c41a458f02b..7f8f1e7228f 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -936,8 +936,9 @@ def ordered(self) -> Optional[bool]: def ordered(self, value: bool): self.dtype.ordered = value - def cat(self, parent: ParentType = None): - return CategoricalAccessor(self, parent=parent) + @property + def cat(self): + return CategoricalAccessor(self, parent=None) def unary_operator(self, unaryop: str): raise TypeError( @@ -1085,7 +1086,7 @@ def to_pandas( col = self signed_dtype = min_signed_type(len(col.categories)) - codes = col.cat().codes.astype(signed_dtype).fillna(-1).to_array() + codes = col.cat.codes.astype(signed_dtype).fillna(-1).to_array() categories = col.categories.dropna(drop_nan=True).to_pandas() data = pd.Categorical.from_codes( codes, categories=categories, ordered=col.ordered @@ -1198,13 +1199,11 @@ def find_and_replace( # named 'index', which came from the filtered categories, # contains the new ints that we need to map to to_replace_col = column.as_column(catmap.index).astype( - self.cat().codes.dtype - ) - replacement_col = catmap["index"]._column.astype( - self.cat().codes.dtype + self.cat.codes.dtype ) + replacement_col = catmap["index"]._column.astype(self.cat.codes.dtype) - replaced = column.as_column(self.cat().codes) + replaced = column.as_column(self.cat.codes) output = libcudf.replace.replace( replaced, to_replace_col, replacement_col ) @@ -1282,10 +1281,8 @@ def fillna( ) # TODO: only required if fill_value has a subset of the # categories: - fill_value = fill_value.cat()._set_categories( - fill_value.cat().categories, - self.categories, - is_unique=True, + fill_value = fill_value.cat._set_categories( + fill_value.cat.categories, self.categories, is_unique=True, ) fill_value = column.as_column(fill_value.codes).astype( self.codes.dtype @@ -1363,7 +1360,7 @@ def as_categorical_column( # return a column full of Nulls. return _create_empty_categorical_column(self, dtype) - return self.cat().set_categories( + return self.cat.set_categories( new_categories=dtype.categories, ordered=dtype.ordered ) @@ -1388,8 +1385,8 @@ def as_timedelta_column(self, dtype, **kwargs) -> TimeDeltaColumn: def _get_decategorized_column(self) -> ColumnBase: if self.null_count == len(self): # self.categories is empty; just return codes - return self.cat().codes._column - gather_map = self.cat().codes.astype("int32").fillna(0)._column + return self.cat.codes._column + gather_map = self.cat.codes.astype("int32").fillna(0)._column out = self.categories.take(gather_map) out = out.set_mask(self.mask) return out @@ -1422,9 +1419,7 @@ def copy(self, deep: bool = True) -> CategoricalColumn: ) def __sizeof__(self) -> int: - return ( - self.cat().categories.__sizeof__() + self.cat().codes.__sizeof__() - ) + return self.cat.categories.__sizeof__() + self.cat.codes.__sizeof__() def _memory_usage(self, **kwargs) -> int: deep = kwargs.get("deep", False) @@ -1432,8 +1427,7 @@ def _memory_usage(self, **kwargs) -> int: return self.__sizeof__() else: return ( - self.categories._memory_usage() - + self.cat().codes.memory_usage() + self.categories._memory_usage() + self.cat.codes.memory_usage() ) def _mimic_inplace( @@ -1459,7 +1453,7 @@ def _create_empty_categorical_column( cudf.utils.utils.scalar_broadcast_to( categorical_column.default_na_value(), categorical_column.size, - np.dtype(categorical_column.cat().codes), + np.dtype(categorical_column.cat.codes), ) ), offset=categorical_column.offset, diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 1bad2c3a451..dd59e0909a0 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -63,6 +63,7 @@ from cudf.utils.utils import mask_dtype T = TypeVar("T", bound="ColumnBase") +ParentType = Union["cudf.Series", "cudf.Index"] class ColumnBase(Column, Serializable): @@ -188,9 +189,8 @@ def __sizeof__(self) -> int: n += bitmask_allocation_size_bytes(self.size) return n - def cat( - self, parent=None - ) -> "cudf.core.column.categorical.CategoricalAccessor": + @property + def cat(self) -> "cudf.core.column.categorical.CategoricalAccessor": raise NotImplementedError() def str(self, parent=None) -> "cudf.core.column.string.StringMethods": @@ -253,21 +253,19 @@ def _concat( if is_categorical: # Combine and de-dupe the categories cats = ( - cudf.concat([o.cat().categories for o in objs]) + cudf.concat([o.cat.categories for o in objs]) .to_series() .drop_duplicates(ignore_index=True) ._column ) objs = [ - o.cat()._set_categories( - o.cat().categories, cats, is_unique=True - ) + o.cat._set_categories(o.cat.categories, cats, is_unique=True) for o in objs ] # Map `objs` into a list of the codes until we port Categorical to # use the libcudf++ Category data type. - objs = [o.cat().codes._column for o in objs] - head = head.cat().codes._column + objs = [o.cat.codes._column for o in objs] + head = head.cat.codes._column newsize = sum(map(len, objs)) if newsize > libcudf.MAX_COLUMN_SIZE: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 18a7f052d62..c0e1b8995aa 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5396,11 +5396,12 @@ def to_arrow(self, preserve_index=True): out = super(DataFrame, data).to_arrow() metadata = pa.pandas_compat.construct_metadata( - self, - out.schema.names, - [self.index], - index_descr, - preserve_index, + columns_to_convert=self._data.columns, + df=self, + column_names=out.schema.names, + index_levels=[self.index], + index_descriptors=index_descr, + preserve_index=preserve_index, types=out.schema.types, ) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 926aad368b0..f413537bbeb 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3792,7 +3792,7 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): ): # Combine and de-dupe the categories categories[idx] = ( - cudf.concat([col.cat().categories for col in cols]) + cudf.concat([col.cat.categories for col in cols]) .to_series() .drop_duplicates(ignore_index=True) ._column @@ -3823,9 +3823,8 @@ def _cast_cols_to_common_dtypes(col_idxs, list_of_columns, dtypes, categories): if idx in categories: cols[idx] = ( cols[idx] - .cat() - ._set_categories( - cols[idx].cat().categories, + .cat._set_categories( + cols[idx].cat.categories, categories[idx], is_unique=True, ) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 8c86352b2a7..b605d51af90 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2672,17 +2672,15 @@ def __new__( dtype = None if categories is not None: - data.cat().set_categories( - categories, ordered=ordered, inplace=True - ) + data.cat.set_categories(categories, ordered=ordered, inplace=True) elif isinstance(dtype, (pd.CategoricalDtype, cudf.CategoricalDtype)): - data.cat().set_categories( + data.cat.set_categories( dtype.categories, ordered=ordered, inplace=True ) elif ordered is True and data.ordered is False: - data.cat().as_ordered(inplace=True) + data.cat.as_ordered(inplace=True) elif ordered is False and data.ordered is True: - data.cat().as_unordered(inplace=True) + data.cat.as_unordered(inplace=True) out._initialize(data, **kwargs) @@ -2693,14 +2691,14 @@ def codes(self): """ The category codes of this categorical. """ - return self._values.cat().codes + return self._values.cat.codes @property def categories(self): """ The categories of this categorical. """ - return self._values.cat().categories + return self._values.cat.categories class StringIndex(GenericIndex): diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index e2c7ca7dca1..f9b60cfce0b 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -177,7 +177,7 @@ def to_csv( df = df.copy(deep=False) for col_name, col in df._data.items(): if isinstance(col, cudf.core.column.CategoricalColumn): - df._data[col_name] = col.astype(col.cat().categories.dtype) + df._data[col_name] = col.astype(col.cat.categories.dtype) if isinstance(df.index, cudf.CategoricalIndex): df.index = df.index.astype(df.index.categories.dtype) diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index bacab24a6f3..9930327d89d 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -162,8 +162,8 @@ def assert_column_equal( if check_exact and check_categorical: if is_categorical_dtype(left) and is_categorical_dtype(right): - left_cat = left.cat().categories - right_cat = right.cat().categories + left_cat = left.cat.categories + right_cat = right.cat.categories if check_category_order: assert_index_equal( diff --git a/python/cudf/cudf/tests/__init__.py b/python/cudf/cudf/tests/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 69f6ecfeb17..b04d2ca3da8 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -5149,8 +5149,8 @@ def test_memory_usage_cat(): gdf = cudf.from_pandas(df) expected = ( - gdf.B._column.cat().categories.__sizeof__() - + gdf.B._column.cat().codes.__sizeof__() + gdf.B._column.cat.categories.__sizeof__() + + gdf.B._column.cat.codes.__sizeof__() ) # Check cat column From 74603fbc5e78918042397b76d5464250a2265afc Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 3 Mar 2021 14:11:49 -0600 Subject: [PATCH 02/54] move utility code out of tests folder --- python/cudf/cudf/_fuzz_testing/avro.py | 2 +- python/cudf/cudf/_fuzz_testing/csv.py | 2 +- python/cudf/cudf/_fuzz_testing/json.py | 2 +- python/cudf/cudf/_fuzz_testing/orc.py | 2 +- python/cudf/cudf/_fuzz_testing/parquet.py | 2 +- python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py | 2 +- python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py | 2 +- python/cudf/cudf/_fuzz_testing/utils.py | 2 +- python/cudf/cudf/{tests => testing}/dataset_generator.py | 0 python/cudf/cudf/{tests => testing}/utils.py | 0 python/cudf/cudf/tests/test_apply_rows.py | 2 +- python/cudf/cudf/tests/test_applymap.py | 2 +- python/cudf/cudf/tests/test_array_function.py | 2 +- python/cudf/cudf/tests/test_array_ufunc.py | 7 ++++--- .../cudf/tests/test_avro_reader_fastavro_integration.py | 2 +- python/cudf/cudf/tests/test_binops.py | 2 +- python/cudf/cudf/tests/test_categorical.py | 2 +- python/cudf/cudf/tests/test_column.py | 8 ++++---- python/cudf/cudf/tests/test_column_accessor.py | 2 +- python/cudf/cudf/tests/test_concat.py | 2 +- python/cudf/cudf/tests/test_contains.py | 2 +- python/cudf/cudf/tests/test_copying.py | 2 +- python/cudf/cudf/tests/test_csv.py | 2 +- python/cudf/cudf/tests/test_cuda_apply.py | 2 +- python/cudf/cudf/tests/test_cuda_array_interface.py | 2 +- python/cudf/cudf/tests/test_custom_accessor.py | 4 ++-- python/cudf/cudf/tests/test_dataframe.py | 4 ++-- python/cudf/cudf/tests/test_dataframe_copy.py | 2 +- python/cudf/cudf/tests/test_datasets.py | 2 +- python/cudf/cudf/tests/test_datetime.py | 2 +- python/cudf/cudf/tests/test_dlpack.py | 2 +- python/cudf/cudf/tests/test_dropna.py | 2 +- python/cudf/cudf/tests/test_dtypes.py | 4 ++-- python/cudf/cudf/tests/test_duplicates.py | 2 +- python/cudf/cudf/tests/test_factorize.py | 2 +- python/cudf/cudf/tests/test_feather.py | 2 +- python/cudf/cudf/tests/test_fill.py | 2 +- python/cudf/cudf/tests/test_gcs.py | 2 +- python/cudf/cudf/tests/test_gpu_arrow_parser.py | 2 +- python/cudf/cudf/tests/test_groupby.py | 2 +- python/cudf/cudf/tests/test_hdf.py | 2 +- python/cudf/cudf/tests/test_hdfs.py | 2 +- python/cudf/cudf/tests/test_index.py | 2 +- python/cudf/cudf/tests/test_indexing.py | 8 ++++++-- python/cudf/cudf/tests/test_interval.py | 2 +- python/cudf/cudf/tests/test_joining.py | 2 +- python/cudf/cudf/tests/test_json.py | 2 +- python/cudf/cudf/tests/test_list.py | 2 +- python/cudf/cudf/tests/test_monotonic.py | 2 +- python/cudf/cudf/tests/test_multiindex.py | 2 +- python/cudf/cudf/tests/test_numerical.py | 2 +- python/cudf/cudf/tests/test_numpy_interop.py | 2 +- python/cudf/cudf/tests/test_onehot.py | 2 +- python/cudf/cudf/tests/test_ops.py | 2 +- python/cudf/cudf/tests/test_orc.py | 6 +++++- python/cudf/cudf/tests/test_pandas_interop.py | 2 +- python/cudf/cudf/tests/test_parquet.py | 4 ++-- python/cudf/cudf/tests/test_pickling.py | 2 +- python/cudf/cudf/tests/test_quantiles.py | 2 +- python/cudf/cudf/tests/test_query.py | 2 +- python/cudf/cudf/tests/test_query_mask.py | 2 +- python/cudf/cudf/tests/test_rank.py | 2 +- python/cudf/cudf/tests/test_reductions.py | 4 ++-- python/cudf/cudf/tests/test_replace.py | 2 +- python/cudf/cudf/tests/test_repr.py | 2 +- python/cudf/cudf/tests/test_reshape.py | 2 +- python/cudf/cudf/tests/test_rolling.py | 2 +- python/cudf/cudf/tests/test_s3.py | 2 +- python/cudf/cudf/tests/test_scalar.py | 2 +- python/cudf/cudf/tests/test_scan.py | 7 ++++++- python/cudf/cudf/tests/test_search.py | 2 +- python/cudf/cudf/tests/test_serialize.py | 4 ++-- python/cudf/cudf/tests/test_series.py | 2 +- python/cudf/cudf/tests/test_seriesmap.py | 4 ++-- python/cudf/cudf/tests/test_setitem.py | 2 +- python/cudf/cudf/tests/test_sorting.py | 2 +- python/cudf/cudf/tests/test_sparse_df.py | 2 +- python/cudf/cudf/tests/test_stats.py | 2 +- python/cudf/cudf/tests/test_string.py | 2 +- python/cudf/cudf/tests/test_struct.py | 2 +- python/cudf/cudf/tests/test_testing.py | 2 +- python/cudf/cudf/tests/test_text.py | 2 +- python/cudf/cudf/tests/test_timedelta.py | 4 ++-- python/cudf/cudf/tests/test_transform.py | 2 +- python/cudf/cudf/tests/test_unaops.py | 2 +- python/custreamz/custreamz/tests/test_kafka.py | 2 +- python/dask_cudf/dask_cudf/tests/test_accessor.py | 2 +- python/dask_cudf/dask_cudf/tests/test_core.py | 6 +++--- python/dask_cudf/dask_cudf/tests/test_distributed.py | 2 +- 89 files changed, 117 insertions(+), 103 deletions(-) rename python/cudf/cudf/{tests => testing}/dataset_generator.py (100%) rename python/cudf/cudf/{tests => testing}/utils.py (100%) diff --git a/python/cudf/cudf/_fuzz_testing/avro.py b/python/cudf/cudf/_fuzz_testing/avro.py index a07e3acf416..4c167ac627f 100644 --- a/python/cudf/cudf/_fuzz_testing/avro.py +++ b/python/cudf/cudf/_fuzz_testing/avro.py @@ -15,7 +15,7 @@ pandas_to_avro, pyarrow_to_pandas, ) -from cudf.tests import dataset_generator as dg +from cudf.testing import dataset_generator as dg logging.basicConfig( format="%(asctime)s %(levelname)-8s %(message)s", diff --git a/python/cudf/cudf/_fuzz_testing/csv.py b/python/cudf/cudf/_fuzz_testing/csv.py index 84346ed61ad..0acb9c8a471 100644 --- a/python/cudf/cudf/_fuzz_testing/csv.py +++ b/python/cudf/cudf/_fuzz_testing/csv.py @@ -12,7 +12,7 @@ _generate_rand_meta, pyarrow_to_pandas, ) -from cudf.tests import dataset_generator as dg +from cudf.testing import dataset_generator as dg from cudf.utils.dtypes import pandas_dtypes_to_cudf_dtypes logging.basicConfig( diff --git a/python/cudf/cudf/_fuzz_testing/json.py b/python/cudf/cudf/_fuzz_testing/json.py index 5ecb27f7665..df9226cf059 100644 --- a/python/cudf/cudf/_fuzz_testing/json.py +++ b/python/cudf/cudf/_fuzz_testing/json.py @@ -13,7 +13,7 @@ _generate_rand_meta, pyarrow_to_pandas, ) -from cudf.tests import dataset_generator as dg +from cudf.testing import dataset_generator as dg from cudf.utils.dtypes import pandas_dtypes_to_cudf_dtypes logging.basicConfig( diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py index 607294a49c9..2aa01eb3967 100644 --- a/python/cudf/cudf/_fuzz_testing/orc.py +++ b/python/cudf/cudf/_fuzz_testing/orc.py @@ -16,7 +16,7 @@ pandas_to_orc, pyarrow_to_pandas, ) -from cudf.tests import dataset_generator as dg +from cudf.testing import dataset_generator as dg logging.basicConfig( format="%(asctime)s %(levelname)-8s %(message)s", diff --git a/python/cudf/cudf/_fuzz_testing/parquet.py b/python/cudf/cudf/_fuzz_testing/parquet.py index 4a9b63cd6aa..ca08e3348c0 100644 --- a/python/cudf/cudf/_fuzz_testing/parquet.py +++ b/python/cudf/cudf/_fuzz_testing/parquet.py @@ -12,7 +12,7 @@ _generate_rand_meta, pyarrow_to_pandas, ) -from cudf.tests import dataset_generator as dg +from cudf.testing import dataset_generator as dg logging.basicConfig( format="%(asctime)s %(levelname)-8s %(message)s", diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py index e6a5d081980..ff2ccd19696 100644 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py +++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py @@ -13,7 +13,7 @@ compare_content, run_test, ) -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq @pythonfuzz(data_handle=CSVReader) diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py index f3da03f447b..6c9d564d088 100644 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py +++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py @@ -9,7 +9,7 @@ from cudf._fuzz_testing.json import JSONReader, JSONWriter from cudf._fuzz_testing.main import pythonfuzz from cudf._fuzz_testing.utils import ALL_POSSIBLE_VALUES, run_test -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq @pythonfuzz(data_handle=JSONReader) diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index efcbd8ca792..6869e40492c 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -9,7 +9,7 @@ import pyorc import cudf -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq from cudf.utils.dtypes import ( pandas_dtypes_to_cudf_dtypes, pyarrow_dtypes_to_pandas_dtypes, diff --git a/python/cudf/cudf/tests/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py similarity index 100% rename from python/cudf/cudf/tests/dataset_generator.py rename to python/cudf/cudf/testing/dataset_generator.py diff --git a/python/cudf/cudf/tests/utils.py b/python/cudf/cudf/testing/utils.py similarity index 100% rename from python/cudf/cudf/tests/utils.py rename to python/cudf/cudf/testing/utils.py diff --git a/python/cudf/cudf/tests/test_apply_rows.py b/python/cudf/cudf/tests/test_apply_rows.py index 0ba80278fca..721e13b670f 100644 --- a/python/cudf/cudf/tests/test_apply_rows.py +++ b/python/cudf/cudf/tests/test_apply_rows.py @@ -2,7 +2,7 @@ import cudf from cudf.core.column import column -from cudf.tests.utils import assert_eq, gen_rand_series +from cudf.testing.utils import assert_eq, gen_rand_series def _kernel_multiply(a, b, out): diff --git a/python/cudf/cudf/tests/test_applymap.py b/python/cudf/cudf/tests/test_applymap.py index 1f35bc93c78..eeacf05b33b 100644 --- a/python/cudf/cudf/tests/test_applymap.py +++ b/python/cudf/cudf/tests/test_applymap.py @@ -7,7 +7,7 @@ import pytest from cudf import Series -from cudf.tests import utils +from cudf.testing import utils @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py index 03f9cf1d7e5..71804cb717e 100644 --- a/python/cudf/cudf/tests/test_array_function.py +++ b/python/cudf/cudf/tests/test_array_function.py @@ -4,7 +4,7 @@ import pytest import cudf -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq from cudf.utils.utils import IS_NEP18_ACTIVE missing_arrfunc_cond = not IS_NEP18_ACTIVE diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index f9e0bb2ce8a..c888d32276b 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -1,9 +1,10 @@ -import cudf -import numpy as np import cupy as cp +import numpy as np import pandas as pd import pytest -from cudf.tests.utils import assert_eq + +import cudf +from cudf.testing.utils import assert_eq @pytest.fixture diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py index a52ee937574..050f98d5ed5 100644 --- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py +++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py @@ -18,7 +18,7 @@ import pytest import cudf -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq def cudf_from_avro_util(schema, records): diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 579716f8277..c07efc90f10 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -14,7 +14,7 @@ import cudf from cudf.core import Series from cudf.core.index import as_index -from cudf.tests import utils +from cudf.testing import utils from cudf.utils.dtypes import ( BOOL_TYPES, DATETIME_TYPES, diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index a117c15f14d..042bdea81f5 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -9,7 +9,7 @@ import cudf from cudf.core._compat import PANDAS_GE_110 -from cudf.tests.utils import assert_eq, assert_exceptions_equal +from cudf.testing.utils import assert_eq, assert_exceptions_equal @pytest.fixture diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 9509cabc117..10192ea85ba 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -8,7 +8,7 @@ import cudf from cudf._lib.transform import mask_to_bools from cudf.core.column.column import as_column -from cudf.tests.utils import assert_eq, assert_exceptions_equal +from cudf.testing.utils import assert_eq, assert_exceptions_equal from cudf.utils import dtypes as dtypeutils dtypes = sorted( @@ -97,8 +97,8 @@ def test_column_series_multi_dim(data): @pytest.mark.parametrize( ("data", "error"), [ - ([1, "1.0", "2", -3], TypeError), - ([np.nan, 0, "null", cp.nan], TypeError), + ([1, "1.0", "2", -3], pa.lib.ArrowInvalid), + ([np.nan, 0, "null", cp.nan], pa.lib.ArrowInvalid), ( [np.int32(4), np.float64(1.5), np.float32(1.290994), np.int8(0)], None, @@ -109,7 +109,7 @@ def test_column_mixed_dtype(data, error): if error is None: cudf.Series(data) else: - with pytest.raises(TypeError): + with pytest.raises(error): cudf.Series(data) diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py index 86a7927dcac..7342c04d0db 100644 --- a/python/cudf/cudf/tests/test_column_accessor.py +++ b/python/cudf/cudf/tests/test_column_accessor.py @@ -6,7 +6,7 @@ import cudf from cudf.core.column_accessor import ColumnAccessor -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq simple_test_data = [ {}, diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index d0e31a82b28..beb505b34d0 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -7,7 +7,7 @@ import pytest import cudf as gd -from cudf.tests.utils import assert_eq, assert_exceptions_equal +from cudf.testing.utils import assert_eq, assert_exceptions_equal from cudf.utils.dtypes import is_categorical_dtype diff --git a/python/cudf/cudf/tests/test_contains.py b/python/cudf/cudf/tests/test_contains.py index 4737faf65a4..ee5e87c2b2c 100644 --- a/python/cudf/cudf/tests/test_contains.py +++ b/python/cudf/cudf/tests/test_contains.py @@ -5,7 +5,7 @@ from cudf import Series from cudf.core.index import RangeIndex, as_index -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq def cudf_date_series(start, stop, freq): diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py index ed6a1169a2a..249cdd3c310 100644 --- a/python/cudf/cudf/tests/test_copying.py +++ b/python/cudf/cudf/tests/test_copying.py @@ -6,7 +6,7 @@ import cudf from cudf.core import Series -from cudf.tests.utils import NUMERIC_TYPES, OTHER_TYPES, assert_eq +from cudf.testing.utils import NUMERIC_TYPES, OTHER_TYPES, assert_eq @pytest.mark.parametrize("dtype", NUMERIC_TYPES + OTHER_TYPES) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index d972d2ad11c..3633873c73d 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -15,7 +15,7 @@ import cudf from cudf import read_csv -from cudf.tests.utils import assert_eq, assert_exceptions_equal +from cudf.testing.utils import assert_eq, assert_exceptions_equal def make_numeric_dataframe(nrows, dtype): diff --git a/python/cudf/cudf/tests/test_cuda_apply.py b/python/cudf/cudf/tests/test_cuda_apply.py index fa880da6804..c608e67ac3e 100644 --- a/python/cudf/cudf/tests/test_cuda_apply.py +++ b/python/cudf/cudf/tests/test_cuda_apply.py @@ -9,7 +9,7 @@ from numba import cuda from cudf import DataFrame -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq @pytest.mark.parametrize("nelem", [1, 2, 64, 128, 129]) diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py index 42e5ab38f50..47ce8ac1132 100644 --- a/python/cudf/cudf/tests/test_cuda_array_interface.py +++ b/python/cudf/cudf/tests/test_cuda_array_interface.py @@ -10,7 +10,7 @@ from numba import cuda import cudf -from cudf.tests.utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq +from cudf.testing.utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq @pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES) diff --git a/python/cudf/cudf/tests/test_custom_accessor.py b/python/cudf/cudf/tests/test_custom_accessor.py index d72b5875677..b032a8d0eda 100644 --- a/python/cudf/cudf/tests/test_custom_accessor.py +++ b/python/cudf/cudf/tests/test_custom_accessor.py @@ -2,9 +2,9 @@ import pandas as pd import pytest -import cudf as gd -from cudf.tests.utils import assert_eq +import cudf as gd +from cudf.testing.utils import assert_eq @gd.api.extensions.register_dataframe_accessor("point") diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index b04d2ca3da8..60bd6a0b801 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -20,8 +20,8 @@ import cudf from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_120 from cudf.core.column import column -from cudf.tests import utils -from cudf.tests.utils import ( +from cudf.testing import utils +from cudf.testing.utils import ( ALL_TYPES, DATETIME_TYPES, NUMERIC_TYPES, diff --git a/python/cudf/cudf/tests/test_dataframe_copy.py b/python/cudf/cudf/tests/test_dataframe_copy.py index 35788e660ea..8c9a1b42ae7 100644 --- a/python/cudf/cudf/tests/test_dataframe_copy.py +++ b/python/cudf/cudf/tests/test_dataframe_copy.py @@ -7,7 +7,7 @@ from numba import cuda from cudf.core.dataframe import DataFrame -from cudf.tests.utils import ALL_TYPES, assert_eq +from cudf.testing.utils import ALL_TYPES, assert_eq """ DataFrame copy expectations diff --git a/python/cudf/cudf/tests/test_datasets.py b/python/cudf/cudf/tests/test_datasets.py index a603a6b4658..ccb66fc7306 100644 --- a/python/cudf/cudf/tests/test_datasets.py +++ b/python/cudf/cudf/tests/test_datasets.py @@ -1,7 +1,7 @@ import numpy as np import cudf as gd -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq def test_dataset_timeseries(): diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 7eb8fcd0aa4..d572dbd4a36 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -14,7 +14,7 @@ import cudf from cudf.core import DataFrame, Series from cudf.core.index import DatetimeIndex -from cudf.tests.utils import ( +from cudf.testing.utils import ( DATETIME_TYPES, NUMERIC_TYPES, assert_eq, diff --git a/python/cudf/cudf/tests/test_dlpack.py b/python/cudf/cudf/tests/test_dlpack.py index b8175d05137..d72b6a49f72 100644 --- a/python/cudf/cudf/tests/test_dlpack.py +++ b/python/cudf/cudf/tests/test_dlpack.py @@ -8,7 +8,7 @@ import pytest import cudf -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq nelems = [0, 3, 10] dtype = [np.uint16, np.int32, np.float64] diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py index d01627309d6..684eed62168 100644 --- a/python/cudf/cudf/tests/test_dropna.py +++ b/python/cudf/cudf/tests/test_dropna.py @@ -5,7 +5,7 @@ import pytest import cudf -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index b6e2aac0304..0f1a2b7fe59 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -9,11 +9,11 @@ from cudf.core.dtypes import ( CategoricalDtype, Decimal64Dtype, + IntervalDtype, ListDtype, StructDtype, - IntervalDtype, ) -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq def test_cdt_basic(): diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py index f721b7a28e5..457adbd4836 100644 --- a/python/cudf/cudf/tests/test_duplicates.py +++ b/python/cudf/cudf/tests/test_duplicates.py @@ -9,7 +9,7 @@ import cudf from cudf import concat -from cudf.tests.utils import assert_eq, assert_exceptions_equal +from cudf.testing.utils import assert_eq, assert_exceptions_equal # TODO: PANDAS 1.0 support # Revisit drop_duplicates() tests to update parameters like ignore_index. diff --git a/python/cudf/cudf/tests/test_factorize.py b/python/cudf/cudf/tests/test_factorize.py index 61d11fa5961..60ba46277f4 100644 --- a/python/cudf/cudf/tests/test_factorize.py +++ b/python/cudf/cudf/tests/test_factorize.py @@ -7,7 +7,7 @@ import cudf from cudf.core import DataFrame, Index -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq @pytest.mark.parametrize("ncats,nelem", [(2, 2), (2, 10), (10, 100)]) diff --git a/python/cudf/cudf/tests/test_feather.py b/python/cudf/cudf/tests/test_feather.py index 525b88fc7ff..61362edb8b9 100644 --- a/python/cudf/cudf/tests/test_feather.py +++ b/python/cudf/cudf/tests/test_feather.py @@ -10,7 +10,7 @@ import pytest import cudf -from cudf.tests.utils import NUMERIC_TYPES, assert_eq +from cudf.testing.utils import NUMERIC_TYPES, assert_eq if LooseVersion(pd.__version__) < LooseVersion("0.24"): try: diff --git a/python/cudf/cudf/tests/test_fill.py b/python/cudf/cudf/tests/test_fill.py index 83d15b36e64..e6904328065 100644 --- a/python/cudf/cudf/tests/test_fill.py +++ b/python/cudf/cudf/tests/test_fill.py @@ -2,7 +2,7 @@ import pytest import cudf -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_gcs.py b/python/cudf/cudf/tests/test_gcs.py index 5d287a57df8..181b31f5327 100644 --- a/python/cudf/cudf/tests/test_gcs.py +++ b/python/cudf/cudf/tests/test_gcs.py @@ -10,7 +10,7 @@ import pytest import cudf -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq gcsfs = pytest.importorskip("gcsfs") diff --git a/python/cudf/cudf/tests/test_gpu_arrow_parser.py b/python/cudf/cudf/tests/test_gpu_arrow_parser.py index e3c8e69695d..96b051a53b4 100644 --- a/python/cudf/cudf/tests/test_gpu_arrow_parser.py +++ b/python/cudf/cudf/tests/test_gpu_arrow_parser.py @@ -8,7 +8,7 @@ import cudf from cudf.comm.gpuarrow import GpuArrowReader -from cudf.tests.utils import INTEGER_TYPES +from cudf.testing.utils import INTEGER_TYPES def make_gpu_parse_arrow_data_batch(): diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 8011510d340..e6d91b87034 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -12,7 +12,7 @@ import cudf from cudf.core import DataFrame, Series from cudf.core._compat import PANDAS_GE_110 -from cudf.tests.utils import assert_eq, assert_exceptions_equal +from cudf.testing.utils import assert_eq, assert_exceptions_equal _now = np.datetime64("now") _tomorrow = _now + np.timedelta64(1, "D") diff --git a/python/cudf/cudf/tests/test_hdf.py b/python/cudf/cudf/tests/test_hdf.py index f908d5f51f5..f1d573a5ca2 100644 --- a/python/cudf/cudf/tests/test_hdf.py +++ b/python/cudf/cudf/tests/test_hdf.py @@ -8,7 +8,7 @@ import pytest import cudf -from cudf.tests.utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq +from cudf.testing.utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq try: import tables # noqa F401 diff --git a/python/cudf/cudf/tests/test_hdfs.py b/python/cudf/cudf/tests/test_hdfs.py index e3867c620fe..b26315b02fd 100644 --- a/python/cudf/cudf/tests/test_hdfs.py +++ b/python/cudf/cudf/tests/test_hdfs.py @@ -11,7 +11,7 @@ from pyarrow import orc as orc import cudf -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq if not os.environ.get("RUN_HDFS_TESTS"): pytestmark = pytest.mark.skip("Env not configured to run HDFS tests") diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 688efef555b..c705ef98138 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -20,7 +20,7 @@ RangeIndex, as_index, ) -from cudf.tests.utils import ( +from cudf.testing.utils import ( FLOAT_TYPES, NUMERIC_TYPES, OTHER_TYPES, diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 73a074c0376..25a0694a4e5 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -9,8 +9,12 @@ import cudf from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_120 -from cudf.tests import utils -from cudf.tests.utils import INTEGER_TYPES, assert_eq, assert_exceptions_equal +from cudf.testing import utils +from cudf.testing.utils import ( + INTEGER_TYPES, + assert_eq, + assert_exceptions_equal, +) index_dtypes = INTEGER_TYPES diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py index c7eafedd409..680ce6ee597 100644 --- a/python/cudf/cudf/tests/test_interval.py +++ b/python/cudf/cudf/tests/test_interval.py @@ -4,7 +4,7 @@ import pytest import cudf -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 969cf1bf549..4461a38fcf9 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -7,7 +7,7 @@ import cudf from cudf.core._compat import PANDAS_GE_120 from cudf.core.dtypes import CategoricalDtype -from cudf.tests.utils import ( +from cudf.testing.utils import ( INTEGER_TYPES, NUMERIC_TYPES, assert_eq, diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index e0a922f35fe..ea85075d766 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -12,7 +12,7 @@ import cudf from cudf.core._compat import PANDAS_GE_110 -from cudf.tests.utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq +from cudf.testing.utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq def make_numeric_dataframe(nrows, dtype): diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 195d8749ec6..07ddf0028f2 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -5,7 +5,7 @@ import pytest import cudf -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py index b26887ad6ae..86304fd3057 100644 --- a/python/cudf/cudf/tests/test_monotonic.py +++ b/python/cudf/cudf/tests/test_monotonic.py @@ -16,7 +16,7 @@ RangeIndex, StringIndex, ) -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq @pytest.mark.parametrize("testrange", [(10, 20, 1), (0, -10, -1), (5, 5, 1)]) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index bd78612d6c7..182fd1dc6ea 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -15,7 +15,7 @@ import cudf from cudf.core.column import as_column from cudf.core.index import as_index -from cudf.tests.utils import assert_eq, assert_exceptions_equal, assert_neq +from cudf.testing.utils import assert_eq, assert_exceptions_equal, assert_neq def test_multiindex_levels_codes_validation(): diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py index 6d9bcda2c0b..ed23741b39c 100644 --- a/python/cudf/cudf/tests/test_numerical.py +++ b/python/cudf/cudf/tests/test_numerical.py @@ -6,7 +6,7 @@ import cudf from cudf.core._compat import PANDAS_GE_100 -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq def test_can_cast_safely_same_kind(): diff --git a/python/cudf/cudf/tests/test_numpy_interop.py b/python/cudf/cudf/tests/test_numpy_interop.py index 521840f8a8a..951af77b155 100644 --- a/python/cudf/cudf/tests/test_numpy_interop.py +++ b/python/cudf/cudf/tests/test_numpy_interop.py @@ -2,7 +2,7 @@ import pytest from cudf.core import DataFrame, Series -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq def test_to_records_noindex(): diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py index f7d9f03832a..1cc2d90e501 100644 --- a/python/cudf/cudf/tests/test_onehot.py +++ b/python/cudf/cudf/tests/test_onehot.py @@ -8,7 +8,7 @@ import cudf from cudf.core import DataFrame, GenericIndex, Series -from cudf.tests import utils +from cudf.testing import utils def test_onehot_simple(): diff --git a/python/cudf/cudf/tests/test_ops.py b/python/cudf/cudf/tests/test_ops.py index 8cdef19d9ba..b7228739cfa 100644 --- a/python/cudf/cudf/tests/test_ops.py +++ b/python/cudf/cudf/tests/test_ops.py @@ -5,7 +5,7 @@ import pytest import cudf -from cudf.tests.utils import assert_eq, gen_rand +from cudf.testing.utils import assert_eq, gen_rand def test_sqrt_float(): diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index ed91e909f25..ff3c81a6cc1 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -13,7 +13,11 @@ import cudf from cudf.io.orc import ORCWriter -from cudf.tests.utils import assert_eq, gen_rand_series, supported_numpy_dtypes +from cudf.testing.utils import ( + assert_eq, + gen_rand_series, + supported_numpy_dtypes, +) @pytest.fixture(scope="module") diff --git a/python/cudf/cudf/tests/test_pandas_interop.py b/python/cudf/cudf/tests/test_pandas_interop.py index 24c60f12a2f..ac1b39c9219 100644 --- a/python/cudf/cudf/tests/test_pandas_interop.py +++ b/python/cudf/cudf/tests/test_pandas_interop.py @@ -5,7 +5,7 @@ import cudf from cudf.core import DataFrame -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq def test_to_pandas(): diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index dc4d0615a7f..526e5adfba9 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -17,8 +17,8 @@ import cudf from cudf.io.parquet import ParquetWriter, merge_parquet_filemetadata -from cudf.tests import dataset_generator as dg -from cudf.tests.utils import assert_eq +from cudf.testing import dataset_generator as dg +from cudf.testing.utils import assert_eq @pytest.fixture(scope="module") diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py index e87ab3730dd..6ca55e625bf 100644 --- a/python/cudf/cudf/tests/test_pickling.py +++ b/python/cudf/cudf/tests/test_pickling.py @@ -8,7 +8,7 @@ from cudf.core import DataFrame, GenericIndex, Series from cudf.core.buffer import Buffer -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq if sys.version_info < (3, 8): try: diff --git a/python/cudf/cudf/tests/test_quantiles.py b/python/cudf/cudf/tests/test_quantiles.py index 49a2603b9a3..f72a8a5fc71 100644 --- a/python/cudf/cudf/tests/test_quantiles.py +++ b/python/cudf/cudf/tests/test_quantiles.py @@ -1,7 +1,7 @@ import pandas as pd import cudf -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq def test_single_q(): diff --git a/python/cudf/cudf/tests/test_query.py b/python/cudf/cudf/tests/test_query.py index b6915a63947..6e81f6f8457 100644 --- a/python/cudf/cudf/tests/test_query.py +++ b/python/cudf/cudf/tests/test_query.py @@ -12,7 +12,7 @@ import cudf from cudf.core import DataFrame -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq from cudf.utils import queryutils _params_query_parser = [] diff --git a/python/cudf/cudf/tests/test_query_mask.py b/python/cudf/cudf/tests/test_query_mask.py index 35479f8308c..08218a3bdbf 100644 --- a/python/cudf/cudf/tests/test_query_mask.py +++ b/python/cudf/cudf/tests/test_query_mask.py @@ -3,7 +3,7 @@ import pytest import cudf -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq _data = [ {"a": [0, 1.0, 2.0, None, np.nan, None, 3, 5]}, diff --git a/python/cudf/cudf/tests/test_rank.py b/python/cudf/cudf/tests/test_rank.py index c86b2c61aa5..dfd8d4824cd 100644 --- a/python/cudf/cudf/tests/test_rank.py +++ b/python/cudf/cudf/tests/test_rank.py @@ -7,7 +7,7 @@ import pytest from cudf.core import DataFrame -from cudf.tests.utils import assert_eq, assert_exceptions_equal +from cudf.testing.utils import assert_eq, assert_exceptions_equal @pytest.fixture diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index 80a2e89bf46..53b07a253ff 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -11,8 +11,8 @@ import cudf from cudf.core import Series -from cudf.tests import utils -from cudf.tests.utils import NUMERIC_TYPES, gen_rand +from cudf.testing import utils +from cudf.testing.utils import NUMERIC_TYPES, gen_rand params_dtype = NUMERIC_TYPES diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index e7baa4ee926..7c7dd948e13 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -8,7 +8,7 @@ import cudf from cudf.core import DataFrame, Series -from cudf.tests.utils import ( +from cudf.testing.utils import ( INTEGER_TYPES, NUMERIC_TYPES, assert_eq, diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 7c274734980..d9de9335889 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -10,7 +10,7 @@ import cudf from cudf.core._compat import PANDAS_GE_110 -from cudf.tests import utils +from cudf.testing import utils from cudf.utils.dtypes import cudf_dtypes_to_pandas_dtypes repr_categories = utils.NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index b030924779d..d25a2dd68ac 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -9,7 +9,7 @@ import cudf from cudf import melt as cudf_melt from cudf.core._compat import PANDAS_GE_120 -from cudf.tests.utils import ( +from cudf.testing.utils import ( ALL_TYPES, DATETIME_TYPES, NUMERIC_TYPES, diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index fcc5591adda..76e09eb5069 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -8,7 +8,7 @@ import cudf from cudf.core._compat import PANDAS_GE_110 -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index 2eefcfef7d2..300a4f6e917 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -14,7 +14,7 @@ import pytest import cudf -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq moto = pytest.importorskip("moto", minversion="1.3.14") boto3 = pytest.importorskip("boto3") diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index 003e46c7e0d..6e8830fb207 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -9,7 +9,7 @@ import cudf from cudf import Scalar as pycudf_scalar from cudf._lib.copying import get_element -from cudf.tests.utils import ( +from cudf.testing.utils import ( ALL_TYPES, DATETIME_TYPES, NUMERIC_TYPES, diff --git a/python/cudf/cudf/tests/test_scan.py b/python/cudf/cudf/tests/test_scan.py index dce65947460..33e9953c9ab 100644 --- a/python/cudf/cudf/tests/test_scan.py +++ b/python/cudf/cudf/tests/test_scan.py @@ -5,7 +5,12 @@ import pytest import cudf -from cudf.tests.utils import INTEGER_TYPES, NUMERIC_TYPES, assert_eq, gen_rand +from cudf.testing.utils import ( + INTEGER_TYPES, + NUMERIC_TYPES, + assert_eq, + gen_rand, +) params_sizes = [0, 1, 2, 5] diff --git a/python/cudf/cudf/tests/test_search.py b/python/cudf/cudf/tests/test_search.py index 4c42e2cb50f..28da93d3401 100644 --- a/python/cudf/cudf/tests/test_search.py +++ b/python/cudf/cudf/tests/test_search.py @@ -5,7 +5,7 @@ import pytest import cudf -from cudf.tests.utils import assert_eq, gen_rand, random_bitmask +from cudf.testing.utils import assert_eq, gen_rand, random_bitmask @pytest.mark.parametrize("side", ["left", "right"]) diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index 656b66bf793..d76575eb8cc 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -8,8 +8,8 @@ import pytest import cudf -from cudf.tests import utils -from cudf.tests.utils import assert_eq +from cudf.testing import utils +from cudf.testing.utils import assert_eq @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index ab9d3d91f73..5b9ecc98b40 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -9,7 +9,7 @@ import pytest import cudf -from cudf.tests.utils import ( +from cudf.testing.utils import ( DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES, diff --git a/python/cudf/cudf/tests/test_seriesmap.py b/python/cudf/cudf/tests/test_seriesmap.py index 324074b6021..be08b3ba9a4 100644 --- a/python/cudf/cudf/tests/test_seriesmap.py +++ b/python/cudf/cudf/tests/test_seriesmap.py @@ -4,12 +4,12 @@ from math import floor import numpy as np -import cudf import pandas as pd import pytest +import cudf from cudf import Series -from cudf.tests.utils import assert_eq, assert_exceptions_equal +from cudf.testing.utils import assert_eq, assert_exceptions_equal def test_series_map_basic(): diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index fc885a13808..cd4eb577d43 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -6,7 +6,7 @@ import cudf from cudf.core._compat import PANDAS_GE_120 -from cudf.tests.utils import assert_eq, assert_exceptions_equal +from cudf.testing.utils import assert_eq, assert_exceptions_equal @pytest.mark.parametrize("df", [pd.DataFrame({"a": [1, 2, 3]})]) diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index b90aebc33dc..5c8b278aaba 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -9,7 +9,7 @@ from cudf.core import DataFrame, Series from cudf.core.column import NumericalColumn -from cudf.tests.utils import ( +from cudf.testing.utils import ( DATETIME_TYPES, NUMERIC_TYPES, assert_eq, diff --git a/python/cudf/cudf/tests/test_sparse_df.py b/python/cudf/cudf/tests/test_sparse_df.py index 4551f48845f..23d149fe78d 100644 --- a/python/cudf/cudf/tests/test_sparse_df.py +++ b/python/cudf/cudf/tests/test_sparse_df.py @@ -8,7 +8,7 @@ from cudf.comm.gpuarrow import GpuArrowReader from cudf.core import DataFrame, Series -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq def read_data(): diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 4e07c974280..ffa605d5782 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -9,7 +9,7 @@ import cudf from cudf.datasets import randomdata -from cudf.tests.utils import assert_eq, assert_exceptions_equal +from cudf.testing.utils import assert_eq, assert_exceptions_equal params_dtypes = [np.int32, np.uint32, np.float32, np.float64] methods = ["min", "max", "sum", "mean", "var", "std"] diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index a015f3387b4..09c6a0ee146 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -16,7 +16,7 @@ from cudf.core._compat import PANDAS_GE_110 from cudf.core.column.string import StringColumn from cudf.core.index import StringIndex, as_index -from cudf.tests.utils import ( +from cudf.testing.utils import ( DATETIME_TYPES, NUMERIC_TYPES, assert_eq, diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index c7efb55c089..abfe1f3a73c 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -4,7 +4,7 @@ import pytest import cudf -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_testing.py b/python/cudf/cudf/tests/test_testing.py index eee7078433d..a90c991937b 100644 --- a/python/cudf/cudf/tests/test_testing.py +++ b/python/cudf/cudf/tests/test_testing.py @@ -10,7 +10,7 @@ assert_index_equal, assert_series_equal, ) -from cudf.tests.utils import NUMERIC_TYPES, OTHER_TYPES, assert_eq +from cudf.testing.utils import NUMERIC_TYPES, OTHER_TYPES, assert_eq @pytest.mark.parametrize("rdata", [[1, 2, 5], [1, 2, 6], [1, 2, 5, 6]]) diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py index 74465c4a54d..be01e6f7c48 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/test_text.py @@ -5,7 +5,7 @@ import pytest import cudf -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq def test_tokenize(): diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index 3efc30af01e..524731295fc 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -11,8 +11,8 @@ import cudf from cudf.core._compat import PANDAS_GE_120 -from cudf.tests import utils as utils -from cudf.tests.utils import assert_eq, assert_exceptions_equal +from cudf.testing import utils as utils +from cudf.testing.utils import assert_eq, assert_exceptions_equal _TIMEDELTA_DATA = [ [1000000, 200000, 3000000], diff --git a/python/cudf/cudf/tests/test_transform.py b/python/cudf/cudf/tests/test_transform.py index 6ec5f88be48..3e77f74d350 100644 --- a/python/cudf/cudf/tests/test_transform.py +++ b/python/cudf/cudf/tests/test_transform.py @@ -6,7 +6,7 @@ import pytest from cudf.core import Series -from cudf.tests.utils import NUMERIC_TYPES +from cudf.testing.utils import NUMERIC_TYPES supported_types = NUMERIC_TYPES diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py index f132271cfd8..61e437ef9e8 100644 --- a/python/cudf/cudf/tests/test_unaops.py +++ b/python/cudf/cudf/tests/test_unaops.py @@ -10,7 +10,7 @@ import cudf from cudf.core import Series -from cudf.tests import utils +from cudf.testing import utils _unaops = [operator.abs, operator.invert, operator.neg, np.ceil, np.floor] diff --git a/python/custreamz/custreamz/tests/test_kafka.py b/python/custreamz/custreamz/tests/test_kafka.py index 059655d4ca0..157b021a0b7 100644 --- a/python/custreamz/custreamz/tests/test_kafka.py +++ b/python/custreamz/custreamz/tests/test_kafka.py @@ -2,7 +2,7 @@ import confluent_kafka as ck import pytest -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq @pytest.mark.parametrize("commit_offset", [-1, 0, 1, 1000]) diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 76589682717..0ede420ed2e 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -8,7 +8,7 @@ import dask_cudf as dgd from cudf import DataFrame, Series -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq ############################################################################# # Datetime Accessor # diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index e19fe016cc9..6091d0a5681 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -12,10 +12,10 @@ from dask.dataframe.core import make_meta, meta_nonempty from dask.utils import M -import cudf - import dask_cudf as dgd +import cudf + def test_from_cudf(): np.random.seed(0) @@ -706,7 +706,7 @@ def test_dataframe_set_index(): pddf = dd.from_pandas(pdf, npartitions=4) pddf = pddf.set_index("str") - from cudf.tests.utils import assert_eq + from cudf.testing.utils import assert_eq assert_eq(ddf.compute(), pddf.compute()) diff --git a/python/dask_cudf/dask_cudf/tests/test_distributed.py b/python/dask_cudf/dask_cudf/tests/test_distributed.py index cb3c696adc3..7e4adace212 100644 --- a/python/dask_cudf/dask_cudf/tests/test_distributed.py +++ b/python/dask_cudf/dask_cudf/tests/test_distributed.py @@ -9,7 +9,7 @@ import dask_cudf import cudf -from cudf.tests.utils import assert_eq +from cudf.testing.utils import assert_eq dask_cuda = pytest.importorskip("dask_cuda") From b49ed25b68b279167d3b2d143193f4e212fc42d9 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 4 Mar 2021 15:34:48 -0600 Subject: [PATCH 03/54] add workaround for creating an arrow table from CUDABuffer --- python/cudf/cudf/comm/gpuarrow.py | 29 ++++++++++++------- .../cudf/cudf/tests/test_gpu_arrow_parser.py | 4 ++- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/comm/gpuarrow.py b/python/cudf/cudf/comm/gpuarrow.py index 451572224c6..16ddb582605 100644 --- a/python/cudf/cudf/comm/gpuarrow.py +++ b/python/cudf/cudf/comm/gpuarrow.py @@ -1,4 +1,5 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. + from collections import OrderedDict from collections.abc import Sequence @@ -9,7 +10,7 @@ from cudf._lib.gpuarrow import ( CudaRecordBatchStreamReader as _CudaRecordBatchStreamReader, ) -from cudf.core import Series, column +from cudf.core import DataFrame, Series, column from cudf.utils.utils import mask_bitsize, mask_dtype @@ -33,19 +34,25 @@ def __init__(self, source, schema=None): class GpuArrowReader(Sequence): def __init__(self, schema, dev_ary): - self._table = CudaRecordBatchStreamReader(dev_ary, schema).read_all() + table = CudaRecordBatchStreamReader(dev_ary, schema).read_all() + self._df = DataFrame.from_arrow(table) + self._schema = pa.Schema.from_pandas(self._df) def __len__(self): - return self._table.num_columns + return len(self._df._data.names) def __getitem__(self, idx): - return GpuArrowNodeReader(self._table, idx) + return GpuArrowNodeReader( + schema=self._schema, + field=self._schema[idx], + series=self._df._data.columns[idx], + ) def schema(self): """ Return a pyarrow schema """ - return self._table.schema + return self._schema def to_dict(self): """ @@ -58,10 +65,10 @@ def to_dict(self): class GpuArrowNodeReader(object): - def __init__(self, table, index): - self._table = table - self._field = table.schema[index] - self._series = Series(column.as_column(table.column(index))) + def __init__(self, schema, field, series): + self._schema = schema + self._field = field + self._series = Series(column.as_column(series)) self._series.name = self.name def __len__(self): @@ -69,7 +76,7 @@ def __len__(self): @property def schema(self): - return self._table.schema + return self._schema @property def field_schema(self): diff --git a/python/cudf/cudf/tests/test_gpu_arrow_parser.py b/python/cudf/cudf/tests/test_gpu_arrow_parser.py index 96b051a53b4..67c83b9a917 100644 --- a/python/cudf/cudf/tests/test_gpu_arrow_parser.py +++ b/python/cudf/cudf/tests/test_gpu_arrow_parser.py @@ -1,4 +1,5 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. + import logging import numpy as np @@ -178,6 +179,7 @@ def make_gpu_parse_arrow_cats_batch(): def test_gpu_parse_arrow_cats(): + pytest.xfail(reason="need dictionary mapping in libcudf from_arrow") batch = make_gpu_parse_arrow_cats_batch() stream = pa.BufferOutputStream() From c917739f5c1842988cda885bf51b3ad41d8e9601 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 4 Mar 2021 18:24:14 -0600 Subject: [PATCH 04/54] revert cat changes, and make testing utils internal --- .../cudf/_fuzz_testing/tests/fuzz_test_csv.py | 2 +- .../_fuzz_testing/tests/fuzz_test_json.py | 2 +- python/cudf/cudf/_fuzz_testing/utils.py | 2 +- python/cudf/cudf/_lib/transpose.pyx | 6 +-- python/cudf/cudf/_lib/utils.pyx | 8 +++- python/cudf/cudf/core/column/categorical.py | 37 +++++++++++-------- python/cudf/cudf/core/column/column.py | 16 +++++--- python/cudf/cudf/core/dataframe.py | 7 +++- python/cudf/cudf/core/frame.py | 7 ++-- python/cudf/cudf/core/index.py | 14 ++++--- python/cudf/cudf/io/csv.py | 2 +- .../cudf/cudf/testing/{utils.py => _utils.py} | 0 python/cudf/cudf/testing/testing.py | 4 +- python/cudf/cudf/tests/test_apply_rows.py | 2 +- python/cudf/cudf/tests/test_array_function.py | 2 +- python/cudf/cudf/tests/test_array_ufunc.py | 2 +- .../test_avro_reader_fastavro_integration.py | 2 +- python/cudf/cudf/tests/test_categorical.py | 2 +- python/cudf/cudf/tests/test_column.py | 2 +- .../cudf/cudf/tests/test_column_accessor.py | 2 +- python/cudf/cudf/tests/test_concat.py | 2 +- python/cudf/cudf/tests/test_contains.py | 2 +- python/cudf/cudf/tests/test_copying.py | 2 +- python/cudf/cudf/tests/test_csv.py | 2 +- python/cudf/cudf/tests/test_cuda_apply.py | 2 +- .../cudf/tests/test_cuda_array_interface.py | 2 +- .../cudf/cudf/tests/test_custom_accessor.py | 2 +- python/cudf/cudf/tests/test_dataframe.py | 6 +-- python/cudf/cudf/tests/test_dataframe_copy.py | 2 +- python/cudf/cudf/tests/test_datasets.py | 2 +- python/cudf/cudf/tests/test_datetime.py | 2 +- python/cudf/cudf/tests/test_dlpack.py | 2 +- python/cudf/cudf/tests/test_dropna.py | 2 +- python/cudf/cudf/tests/test_dtypes.py | 2 +- python/cudf/cudf/tests/test_duplicates.py | 2 +- python/cudf/cudf/tests/test_factorize.py | 2 +- python/cudf/cudf/tests/test_feather.py | 2 +- python/cudf/cudf/tests/test_fill.py | 2 +- python/cudf/cudf/tests/test_gcs.py | 2 +- .../cudf/cudf/tests/test_gpu_arrow_parser.py | 2 +- python/cudf/cudf/tests/test_groupby.py | 2 +- python/cudf/cudf/tests/test_hdf.py | 2 +- python/cudf/cudf/tests/test_hdfs.py | 2 +- python/cudf/cudf/tests/test_index.py | 2 +- python/cudf/cudf/tests/test_indexing.py | 2 +- python/cudf/cudf/tests/test_interval.py | 2 +- python/cudf/cudf/tests/test_joining.py | 2 +- python/cudf/cudf/tests/test_json.py | 2 +- python/cudf/cudf/tests/test_list.py | 2 +- python/cudf/cudf/tests/test_monotonic.py | 2 +- python/cudf/cudf/tests/test_multiindex.py | 2 +- python/cudf/cudf/tests/test_numerical.py | 2 +- python/cudf/cudf/tests/test_numpy_interop.py | 2 +- python/cudf/cudf/tests/test_ops.py | 2 +- python/cudf/cudf/tests/test_orc.py | 2 +- python/cudf/cudf/tests/test_pandas_interop.py | 2 +- python/cudf/cudf/tests/test_parquet.py | 2 +- python/cudf/cudf/tests/test_pickling.py | 2 +- python/cudf/cudf/tests/test_quantiles.py | 2 +- python/cudf/cudf/tests/test_query.py | 2 +- python/cudf/cudf/tests/test_query_mask.py | 2 +- python/cudf/cudf/tests/test_rank.py | 2 +- python/cudf/cudf/tests/test_reductions.py | 2 +- python/cudf/cudf/tests/test_replace.py | 2 +- python/cudf/cudf/tests/test_reshape.py | 2 +- python/cudf/cudf/tests/test_rolling.py | 2 +- python/cudf/cudf/tests/test_s3.py | 2 +- python/cudf/cudf/tests/test_scalar.py | 2 +- python/cudf/cudf/tests/test_scan.py | 2 +- python/cudf/cudf/tests/test_search.py | 2 +- python/cudf/cudf/tests/test_serialize.py | 2 +- python/cudf/cudf/tests/test_series.py | 2 +- python/cudf/cudf/tests/test_seriesmap.py | 2 +- python/cudf/cudf/tests/test_setitem.py | 2 +- python/cudf/cudf/tests/test_sorting.py | 2 +- python/cudf/cudf/tests/test_sparse_df.py | 2 +- python/cudf/cudf/tests/test_stats.py | 2 +- python/cudf/cudf/tests/test_string.py | 2 +- python/cudf/cudf/tests/test_struct.py | 2 +- python/cudf/cudf/tests/test_testing.py | 2 +- python/cudf/cudf/tests/test_text.py | 2 +- python/cudf/cudf/tests/test_timedelta.py | 2 +- python/cudf/cudf/tests/test_transform.py | 2 +- .../custreamz/custreamz/tests/test_kafka.py | 2 +- .../dask_cudf/tests/test_accessor.py | 2 +- python/dask_cudf/dask_cudf/tests/test_core.py | 2 +- .../dask_cudf/tests/test_distributed.py | 2 +- 87 files changed, 142 insertions(+), 117 deletions(-) rename python/cudf/cudf/testing/{utils.py => _utils.py} (100%) diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py index ff2ccd19696..9b6abeb1276 100644 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py +++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py @@ -13,7 +13,7 @@ compare_content, run_test, ) -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq @pythonfuzz(data_handle=CSVReader) diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py index 6c9d564d088..2f5e6204f7c 100644 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py +++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py @@ -9,7 +9,7 @@ from cudf._fuzz_testing.json import JSONReader, JSONWriter from cudf._fuzz_testing.main import pythonfuzz from cudf._fuzz_testing.utils import ALL_POSSIBLE_VALUES, run_test -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq @pythonfuzz(data_handle=JSONReader) diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index 6869e40492c..f1b95173c3d 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -9,7 +9,7 @@ import pyorc import cudf -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq from cudf.utils.dtypes import ( pandas_dtypes_to_cudf_dtypes, pyarrow_dtypes_to_pandas_dtypes, diff --git a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx index fd02a08c49e..d2b053789cd 100644 --- a/python/cudf/cudf/_lib/transpose.pyx +++ b/python/cudf/cudf/_lib/transpose.pyx @@ -36,11 +36,11 @@ def transpose(Table source): if is_categorical_dtype(dtype): if any(not is_categorical_dtype(c.dtype) for c in source._columns): raise ValueError('Columns must all have the same dtype') - cats = list(c.cat.categories for c in source._columns) + cats = list(c.cat().categories for c in source._columns) cats = cudf.Series(cudf.concat(cats)).drop_duplicates()._column source = Table(index=source._index, data=[ - (name, col.cat._set_categories( - col.cat.categories, cats, is_unique=True).codes) + (name, col.cat()._set_categories( + col.cat().categories, cats, is_unique=True).codes) for name, col in source._data.items() ]) elif dtype.kind in 'OU': diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 26101c8bf7f..58e6f5f454f 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -116,7 +116,13 @@ cpdef generate_pandas_metadata(Table table, index): index_descriptors.append(descr) metadata = pa.pandas_compat.construct_metadata( - columns_to_convert=table._data.columns, + columns_to_convert=[ + col.to_pandas() + if isinstance(col, cudf.core.column.CategoricalColumn) + else + col + for col in table._data.columns + ], df=table, column_names=col_names, index_levels=index_levels, diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 7f8f1e7228f..e0e56edb2f1 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -936,9 +936,9 @@ def ordered(self) -> Optional[bool]: def ordered(self, value: bool): self.dtype.ordered = value - @property - def cat(self): - return CategoricalAccessor(self, parent=None) + # @property + def cat(self, parent: ParentType = None): + return CategoricalAccessor(self, parent=parent) def unary_operator(self, unaryop: str): raise TypeError( @@ -1086,7 +1086,7 @@ def to_pandas( col = self signed_dtype = min_signed_type(len(col.categories)) - codes = col.cat.codes.astype(signed_dtype).fillna(-1).to_array() + codes = col.cat().codes.astype(signed_dtype).fillna(-1).to_array() categories = col.categories.dropna(drop_nan=True).to_pandas() data = pd.Categorical.from_codes( codes, categories=categories, ordered=col.ordered @@ -1199,11 +1199,13 @@ def find_and_replace( # named 'index', which came from the filtered categories, # contains the new ints that we need to map to to_replace_col = column.as_column(catmap.index).astype( - self.cat.codes.dtype + self.cat().codes.dtype + ) + replacement_col = catmap["index"]._column.astype( + self.cat().codes.dtype ) - replacement_col = catmap["index"]._column.astype(self.cat.codes.dtype) - replaced = column.as_column(self.cat.codes) + replaced = column.as_column(self.cat().codes) output = libcudf.replace.replace( replaced, to_replace_col, replacement_col ) @@ -1281,8 +1283,10 @@ def fillna( ) # TODO: only required if fill_value has a subset of the # categories: - fill_value = fill_value.cat._set_categories( - fill_value.cat.categories, self.categories, is_unique=True, + fill_value = fill_value.cat()._set_categories( + fill_value.cat().categories, + self.categories, + is_unique=True, ) fill_value = column.as_column(fill_value.codes).astype( self.codes.dtype @@ -1360,7 +1364,7 @@ def as_categorical_column( # return a column full of Nulls. return _create_empty_categorical_column(self, dtype) - return self.cat.set_categories( + return self.cat().set_categories( new_categories=dtype.categories, ordered=dtype.ordered ) @@ -1385,8 +1389,8 @@ def as_timedelta_column(self, dtype, **kwargs) -> TimeDeltaColumn: def _get_decategorized_column(self) -> ColumnBase: if self.null_count == len(self): # self.categories is empty; just return codes - return self.cat.codes._column - gather_map = self.cat.codes.astype("int32").fillna(0)._column + return self.cat().codes._column + gather_map = self.cat().codes.astype("int32").fillna(0)._column out = self.categories.take(gather_map) out = out.set_mask(self.mask) return out @@ -1419,7 +1423,9 @@ def copy(self, deep: bool = True) -> CategoricalColumn: ) def __sizeof__(self) -> int: - return self.cat.categories.__sizeof__() + self.cat.codes.__sizeof__() + return ( + self.cat().categories.__sizeof__() + self.cat().codes.__sizeof__() + ) def _memory_usage(self, **kwargs) -> int: deep = kwargs.get("deep", False) @@ -1427,7 +1433,8 @@ def _memory_usage(self, **kwargs) -> int: return self.__sizeof__() else: return ( - self.categories._memory_usage() + self.cat.codes.memory_usage() + self.categories._memory_usage() + + self.cat().codes.memory_usage() ) def _mimic_inplace( @@ -1453,7 +1460,7 @@ def _create_empty_categorical_column( cudf.utils.utils.scalar_broadcast_to( categorical_column.default_na_value(), categorical_column.size, - np.dtype(categorical_column.cat.codes), + np.dtype(categorical_column.cat().codes), ) ), offset=categorical_column.offset, diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index eec28c23bea..8e691773e3a 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -189,8 +189,10 @@ def __sizeof__(self) -> int: n += bitmask_allocation_size_bytes(self.size) return n - @property - def cat(self) -> "cudf.core.column.categorical.CategoricalAccessor": + # @property + def cat( + self, parent=None + ) -> "cudf.core.column.categorical.CategoricalAccessor": raise NotImplementedError() def str(self, parent=None) -> "cudf.core.column.string.StringMethods": @@ -253,19 +255,21 @@ def _concat( if is_categorical: # Combine and de-dupe the categories cats = ( - cudf.concat([o.cat.categories for o in objs]) + cudf.concat([o.cat().categories for o in objs]) .to_series() .drop_duplicates(ignore_index=True) ._column ) objs = [ - o.cat._set_categories(o.cat.categories, cats, is_unique=True) + o.cat()._set_categories( + o.cat().categories, cats, is_unique=True + ) for o in objs ] # Map `objs` into a list of the codes until we port Categorical to # use the libcudf++ Category data type. - objs = [o.cat.codes._column for o in objs] - head = head.cat.codes._column + objs = [o.cat().codes._column for o in objs] + head = head.cat().codes._column newsize = sum(map(len, objs)) if newsize > libcudf.MAX_COLUMN_SIZE: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index c0e1b8995aa..955b9bfbaa6 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5396,7 +5396,12 @@ def to_arrow(self, preserve_index=True): out = super(DataFrame, data).to_arrow() metadata = pa.pandas_compat.construct_metadata( - columns_to_convert=self._data.columns, + columns_to_convert=[ + col.to_pandas() + if isinstance(col, cudf.core.column.CategoricalColumn) + else col + for col in self._data.columns + ], df=self, column_names=out.schema.names, index_levels=[self.index], diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index f413537bbeb..926aad368b0 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3792,7 +3792,7 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): ): # Combine and de-dupe the categories categories[idx] = ( - cudf.concat([col.cat.categories for col in cols]) + cudf.concat([col.cat().categories for col in cols]) .to_series() .drop_duplicates(ignore_index=True) ._column @@ -3823,8 +3823,9 @@ def _cast_cols_to_common_dtypes(col_idxs, list_of_columns, dtypes, categories): if idx in categories: cols[idx] = ( cols[idx] - .cat._set_categories( - cols[idx].cat.categories, + .cat() + ._set_categories( + cols[idx].cat().categories, categories[idx], is_unique=True, ) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index b605d51af90..8c86352b2a7 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2672,15 +2672,17 @@ def __new__( dtype = None if categories is not None: - data.cat.set_categories(categories, ordered=ordered, inplace=True) + data.cat().set_categories( + categories, ordered=ordered, inplace=True + ) elif isinstance(dtype, (pd.CategoricalDtype, cudf.CategoricalDtype)): - data.cat.set_categories( + data.cat().set_categories( dtype.categories, ordered=ordered, inplace=True ) elif ordered is True and data.ordered is False: - data.cat.as_ordered(inplace=True) + data.cat().as_ordered(inplace=True) elif ordered is False and data.ordered is True: - data.cat.as_unordered(inplace=True) + data.cat().as_unordered(inplace=True) out._initialize(data, **kwargs) @@ -2691,14 +2693,14 @@ def codes(self): """ The category codes of this categorical. """ - return self._values.cat.codes + return self._values.cat().codes @property def categories(self): """ The categories of this categorical. """ - return self._values.cat.categories + return self._values.cat().categories class StringIndex(GenericIndex): diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index f9b60cfce0b..e2c7ca7dca1 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -177,7 +177,7 @@ def to_csv( df = df.copy(deep=False) for col_name, col in df._data.items(): if isinstance(col, cudf.core.column.CategoricalColumn): - df._data[col_name] = col.astype(col.cat.categories.dtype) + df._data[col_name] = col.astype(col.cat().categories.dtype) if isinstance(df.index, cudf.CategoricalIndex): df.index = df.index.astype(df.index.categories.dtype) diff --git a/python/cudf/cudf/testing/utils.py b/python/cudf/cudf/testing/_utils.py similarity index 100% rename from python/cudf/cudf/testing/utils.py rename to python/cudf/cudf/testing/_utils.py diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 9930327d89d..bacab24a6f3 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -162,8 +162,8 @@ def assert_column_equal( if check_exact and check_categorical: if is_categorical_dtype(left) and is_categorical_dtype(right): - left_cat = left.cat.categories - right_cat = right.cat.categories + left_cat = left.cat().categories + right_cat = right.cat().categories if check_category_order: assert_index_equal( diff --git a/python/cudf/cudf/tests/test_apply_rows.py b/python/cudf/cudf/tests/test_apply_rows.py index 721e13b670f..f025549971f 100644 --- a/python/cudf/cudf/tests/test_apply_rows.py +++ b/python/cudf/cudf/tests/test_apply_rows.py @@ -2,7 +2,7 @@ import cudf from cudf.core.column import column -from cudf.testing.utils import assert_eq, gen_rand_series +from cudf.testing._utils import assert_eq, gen_rand_series def _kernel_multiply(a, b, out): diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py index 71804cb717e..cd4dd28f179 100644 --- a/python/cudf/cudf/tests/test_array_function.py +++ b/python/cudf/cudf/tests/test_array_function.py @@ -4,7 +4,7 @@ import pytest import cudf -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq from cudf.utils.utils import IS_NEP18_ACTIVE missing_arrfunc_cond = not IS_NEP18_ACTIVE diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index c888d32276b..8cfcf4d2b6d 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -4,7 +4,7 @@ import pytest import cudf -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq @pytest.fixture diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py index 050f98d5ed5..48e3b0ec42c 100644 --- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py +++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py @@ -18,7 +18,7 @@ import pytest import cudf -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq def cudf_from_avro_util(schema, records): diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 042bdea81f5..eaafcc468b2 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -9,7 +9,7 @@ import cudf from cudf.core._compat import PANDAS_GE_110 -from cudf.testing.utils import assert_eq, assert_exceptions_equal +from cudf.testing._utils import assert_eq, assert_exceptions_equal @pytest.fixture diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 10192ea85ba..e1410423387 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -8,7 +8,7 @@ import cudf from cudf._lib.transform import mask_to_bools from cudf.core.column.column import as_column -from cudf.testing.utils import assert_eq, assert_exceptions_equal +from cudf.testing._utils import assert_eq, assert_exceptions_equal from cudf.utils import dtypes as dtypeutils dtypes = sorted( diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py index 7342c04d0db..99d4bdd9910 100644 --- a/python/cudf/cudf/tests/test_column_accessor.py +++ b/python/cudf/cudf/tests/test_column_accessor.py @@ -6,7 +6,7 @@ import cudf from cudf.core.column_accessor import ColumnAccessor -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq simple_test_data = [ {}, diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index beb505b34d0..88cd1cadeb8 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -7,7 +7,7 @@ import pytest import cudf as gd -from cudf.testing.utils import assert_eq, assert_exceptions_equal +from cudf.testing._utils import assert_eq, assert_exceptions_equal from cudf.utils.dtypes import is_categorical_dtype diff --git a/python/cudf/cudf/tests/test_contains.py b/python/cudf/cudf/tests/test_contains.py index ee5e87c2b2c..1d1deca5bd6 100644 --- a/python/cudf/cudf/tests/test_contains.py +++ b/python/cudf/cudf/tests/test_contains.py @@ -5,7 +5,7 @@ from cudf import Series from cudf.core.index import RangeIndex, as_index -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq def cudf_date_series(start, stop, freq): diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py index 249cdd3c310..0965b5298a4 100644 --- a/python/cudf/cudf/tests/test_copying.py +++ b/python/cudf/cudf/tests/test_copying.py @@ -6,7 +6,7 @@ import cudf from cudf.core import Series -from cudf.testing.utils import NUMERIC_TYPES, OTHER_TYPES, assert_eq +from cudf.testing._utils import NUMERIC_TYPES, OTHER_TYPES, assert_eq @pytest.mark.parametrize("dtype", NUMERIC_TYPES + OTHER_TYPES) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 3633873c73d..e01a8387abd 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -15,7 +15,7 @@ import cudf from cudf import read_csv -from cudf.testing.utils import assert_eq, assert_exceptions_equal +from cudf.testing._utils import assert_eq, assert_exceptions_equal def make_numeric_dataframe(nrows, dtype): diff --git a/python/cudf/cudf/tests/test_cuda_apply.py b/python/cudf/cudf/tests/test_cuda_apply.py index c608e67ac3e..2604030097b 100644 --- a/python/cudf/cudf/tests/test_cuda_apply.py +++ b/python/cudf/cudf/tests/test_cuda_apply.py @@ -9,7 +9,7 @@ from numba import cuda from cudf import DataFrame -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq @pytest.mark.parametrize("nelem", [1, 2, 64, 128, 129]) diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py index 47ce8ac1132..041dc0076f8 100644 --- a/python/cudf/cudf/tests/test_cuda_array_interface.py +++ b/python/cudf/cudf/tests/test_cuda_array_interface.py @@ -10,7 +10,7 @@ from numba import cuda import cudf -from cudf.testing.utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq +from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq @pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES) diff --git a/python/cudf/cudf/tests/test_custom_accessor.py b/python/cudf/cudf/tests/test_custom_accessor.py index b032a8d0eda..16e5b345ce2 100644 --- a/python/cudf/cudf/tests/test_custom_accessor.py +++ b/python/cudf/cudf/tests/test_custom_accessor.py @@ -4,7 +4,7 @@ import pytest import cudf as gd -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq @gd.api.extensions.register_dataframe_accessor("point") diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 60bd6a0b801..5073dd5b1e2 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -21,7 +21,7 @@ from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_120 from cudf.core.column import column from cudf.testing import utils -from cudf.testing.utils import ( +from cudf.testing._utils import ( ALL_TYPES, DATETIME_TYPES, NUMERIC_TYPES, @@ -5149,8 +5149,8 @@ def test_memory_usage_cat(): gdf = cudf.from_pandas(df) expected = ( - gdf.B._column.cat.categories.__sizeof__() - + gdf.B._column.cat.codes.__sizeof__() + gdf.B._column.cat().categories.__sizeof__() + + gdf.B._column.cat().codes.__sizeof__() ) # Check cat column diff --git a/python/cudf/cudf/tests/test_dataframe_copy.py b/python/cudf/cudf/tests/test_dataframe_copy.py index 8c9a1b42ae7..5b258c760b3 100644 --- a/python/cudf/cudf/tests/test_dataframe_copy.py +++ b/python/cudf/cudf/tests/test_dataframe_copy.py @@ -7,7 +7,7 @@ from numba import cuda from cudf.core.dataframe import DataFrame -from cudf.testing.utils import ALL_TYPES, assert_eq +from cudf.testing._utils import ALL_TYPES, assert_eq """ DataFrame copy expectations diff --git a/python/cudf/cudf/tests/test_datasets.py b/python/cudf/cudf/tests/test_datasets.py index ccb66fc7306..b7bc89f008d 100644 --- a/python/cudf/cudf/tests/test_datasets.py +++ b/python/cudf/cudf/tests/test_datasets.py @@ -1,7 +1,7 @@ import numpy as np import cudf as gd -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq def test_dataset_timeseries(): diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index d572dbd4a36..8a65ed836f2 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -14,7 +14,7 @@ import cudf from cudf.core import DataFrame, Series from cudf.core.index import DatetimeIndex -from cudf.testing.utils import ( +from cudf.testing._utils import ( DATETIME_TYPES, NUMERIC_TYPES, assert_eq, diff --git a/python/cudf/cudf/tests/test_dlpack.py b/python/cudf/cudf/tests/test_dlpack.py index d72b6a49f72..4b2fca0d12d 100644 --- a/python/cudf/cudf/tests/test_dlpack.py +++ b/python/cudf/cudf/tests/test_dlpack.py @@ -8,7 +8,7 @@ import pytest import cudf -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq nelems = [0, 3, 10] dtype = [np.uint16, np.int32, np.float64] diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py index 684eed62168..e1d0c38c760 100644 --- a/python/cudf/cudf/tests/test_dropna.py +++ b/python/cudf/cudf/tests/test_dropna.py @@ -5,7 +5,7 @@ import pytest import cudf -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index 0f1a2b7fe59..0e547d97a32 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -13,7 +13,7 @@ ListDtype, StructDtype, ) -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq def test_cdt_basic(): diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py index 457adbd4836..f464ac1a6c2 100644 --- a/python/cudf/cudf/tests/test_duplicates.py +++ b/python/cudf/cudf/tests/test_duplicates.py @@ -9,7 +9,7 @@ import cudf from cudf import concat -from cudf.testing.utils import assert_eq, assert_exceptions_equal +from cudf.testing._utils import assert_eq, assert_exceptions_equal # TODO: PANDAS 1.0 support # Revisit drop_duplicates() tests to update parameters like ignore_index. diff --git a/python/cudf/cudf/tests/test_factorize.py b/python/cudf/cudf/tests/test_factorize.py index 60ba46277f4..0010079ac79 100644 --- a/python/cudf/cudf/tests/test_factorize.py +++ b/python/cudf/cudf/tests/test_factorize.py @@ -7,7 +7,7 @@ import cudf from cudf.core import DataFrame, Index -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq @pytest.mark.parametrize("ncats,nelem", [(2, 2), (2, 10), (10, 100)]) diff --git a/python/cudf/cudf/tests/test_feather.py b/python/cudf/cudf/tests/test_feather.py index 61362edb8b9..6c83ee3c458 100644 --- a/python/cudf/cudf/tests/test_feather.py +++ b/python/cudf/cudf/tests/test_feather.py @@ -10,7 +10,7 @@ import pytest import cudf -from cudf.testing.utils import NUMERIC_TYPES, assert_eq +from cudf.testing._utils import NUMERIC_TYPES, assert_eq if LooseVersion(pd.__version__) < LooseVersion("0.24"): try: diff --git a/python/cudf/cudf/tests/test_fill.py b/python/cudf/cudf/tests/test_fill.py index e6904328065..efbe2834486 100644 --- a/python/cudf/cudf/tests/test_fill.py +++ b/python/cudf/cudf/tests/test_fill.py @@ -2,7 +2,7 @@ import pytest import cudf -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_gcs.py b/python/cudf/cudf/tests/test_gcs.py index 181b31f5327..99d79e41520 100644 --- a/python/cudf/cudf/tests/test_gcs.py +++ b/python/cudf/cudf/tests/test_gcs.py @@ -10,7 +10,7 @@ import pytest import cudf -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq gcsfs = pytest.importorskip("gcsfs") diff --git a/python/cudf/cudf/tests/test_gpu_arrow_parser.py b/python/cudf/cudf/tests/test_gpu_arrow_parser.py index 67c83b9a917..baf2fa62e38 100644 --- a/python/cudf/cudf/tests/test_gpu_arrow_parser.py +++ b/python/cudf/cudf/tests/test_gpu_arrow_parser.py @@ -9,7 +9,7 @@ import cudf from cudf.comm.gpuarrow import GpuArrowReader -from cudf.testing.utils import INTEGER_TYPES +from cudf.testing._utils import INTEGER_TYPES def make_gpu_parse_arrow_data_batch(): diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index e6d91b87034..e5309e3b8b9 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -12,7 +12,7 @@ import cudf from cudf.core import DataFrame, Series from cudf.core._compat import PANDAS_GE_110 -from cudf.testing.utils import assert_eq, assert_exceptions_equal +from cudf.testing._utils import assert_eq, assert_exceptions_equal _now = np.datetime64("now") _tomorrow = _now + np.timedelta64(1, "D") diff --git a/python/cudf/cudf/tests/test_hdf.py b/python/cudf/cudf/tests/test_hdf.py index f1d573a5ca2..1bf91a52c2f 100644 --- a/python/cudf/cudf/tests/test_hdf.py +++ b/python/cudf/cudf/tests/test_hdf.py @@ -8,7 +8,7 @@ import pytest import cudf -from cudf.testing.utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq +from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq try: import tables # noqa F401 diff --git a/python/cudf/cudf/tests/test_hdfs.py b/python/cudf/cudf/tests/test_hdfs.py index b26315b02fd..24554f113bb 100644 --- a/python/cudf/cudf/tests/test_hdfs.py +++ b/python/cudf/cudf/tests/test_hdfs.py @@ -11,7 +11,7 @@ from pyarrow import orc as orc import cudf -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq if not os.environ.get("RUN_HDFS_TESTS"): pytestmark = pytest.mark.skip("Env not configured to run HDFS tests") diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index c705ef98138..2bd3d4d09ce 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -20,7 +20,7 @@ RangeIndex, as_index, ) -from cudf.testing.utils import ( +from cudf.testing._utils import ( FLOAT_TYPES, NUMERIC_TYPES, OTHER_TYPES, diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 25a0694a4e5..3d6063d3419 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -10,7 +10,7 @@ import cudf from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_120 from cudf.testing import utils -from cudf.testing.utils import ( +from cudf.testing._utils import ( INTEGER_TYPES, assert_eq, assert_exceptions_equal, diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py index 680ce6ee597..fc193441113 100644 --- a/python/cudf/cudf/tests/test_interval.py +++ b/python/cudf/cudf/tests/test_interval.py @@ -4,7 +4,7 @@ import pytest import cudf -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 4461a38fcf9..9babe519817 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -7,7 +7,7 @@ import cudf from cudf.core._compat import PANDAS_GE_120 from cudf.core.dtypes import CategoricalDtype -from cudf.testing.utils import ( +from cudf.testing._utils import ( INTEGER_TYPES, NUMERIC_TYPES, assert_eq, diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index ea85075d766..09ecb8a1efe 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -12,7 +12,7 @@ import cudf from cudf.core._compat import PANDAS_GE_110 -from cudf.testing.utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq +from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq def make_numeric_dataframe(nrows, dtype): diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 07ddf0028f2..d2e1f46416c 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -5,7 +5,7 @@ import pytest import cudf -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py index 86304fd3057..e9c828ec0f5 100644 --- a/python/cudf/cudf/tests/test_monotonic.py +++ b/python/cudf/cudf/tests/test_monotonic.py @@ -16,7 +16,7 @@ RangeIndex, StringIndex, ) -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq @pytest.mark.parametrize("testrange", [(10, 20, 1), (0, -10, -1), (5, 5, 1)]) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 182fd1dc6ea..c8e5a9f071b 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -15,7 +15,7 @@ import cudf from cudf.core.column import as_column from cudf.core.index import as_index -from cudf.testing.utils import assert_eq, assert_exceptions_equal, assert_neq +from cudf.testing._utils import assert_eq, assert_exceptions_equal, assert_neq def test_multiindex_levels_codes_validation(): diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py index ed23741b39c..3e014c98ea7 100644 --- a/python/cudf/cudf/tests/test_numerical.py +++ b/python/cudf/cudf/tests/test_numerical.py @@ -6,7 +6,7 @@ import cudf from cudf.core._compat import PANDAS_GE_100 -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq def test_can_cast_safely_same_kind(): diff --git a/python/cudf/cudf/tests/test_numpy_interop.py b/python/cudf/cudf/tests/test_numpy_interop.py index 951af77b155..e5efe2f027d 100644 --- a/python/cudf/cudf/tests/test_numpy_interop.py +++ b/python/cudf/cudf/tests/test_numpy_interop.py @@ -2,7 +2,7 @@ import pytest from cudf.core import DataFrame, Series -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq def test_to_records_noindex(): diff --git a/python/cudf/cudf/tests/test_ops.py b/python/cudf/cudf/tests/test_ops.py index b7228739cfa..ac3f784ecd4 100644 --- a/python/cudf/cudf/tests/test_ops.py +++ b/python/cudf/cudf/tests/test_ops.py @@ -5,7 +5,7 @@ import pytest import cudf -from cudf.testing.utils import assert_eq, gen_rand +from cudf.testing._utils import assert_eq, gen_rand def test_sqrt_float(): diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index ff3c81a6cc1..8b323f269ff 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -13,7 +13,7 @@ import cudf from cudf.io.orc import ORCWriter -from cudf.testing.utils import ( +from cudf.testing._utils import ( assert_eq, gen_rand_series, supported_numpy_dtypes, diff --git a/python/cudf/cudf/tests/test_pandas_interop.py b/python/cudf/cudf/tests/test_pandas_interop.py index ac1b39c9219..a8a45fc3c28 100644 --- a/python/cudf/cudf/tests/test_pandas_interop.py +++ b/python/cudf/cudf/tests/test_pandas_interop.py @@ -5,7 +5,7 @@ import cudf from cudf.core import DataFrame -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq def test_to_pandas(): diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 526e5adfba9..26c230cab5f 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -18,7 +18,7 @@ import cudf from cudf.io.parquet import ParquetWriter, merge_parquet_filemetadata from cudf.testing import dataset_generator as dg -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq @pytest.fixture(scope="module") diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py index 6ca55e625bf..596af1d2686 100644 --- a/python/cudf/cudf/tests/test_pickling.py +++ b/python/cudf/cudf/tests/test_pickling.py @@ -8,7 +8,7 @@ from cudf.core import DataFrame, GenericIndex, Series from cudf.core.buffer import Buffer -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq if sys.version_info < (3, 8): try: diff --git a/python/cudf/cudf/tests/test_quantiles.py b/python/cudf/cudf/tests/test_quantiles.py index f72a8a5fc71..4055485c49a 100644 --- a/python/cudf/cudf/tests/test_quantiles.py +++ b/python/cudf/cudf/tests/test_quantiles.py @@ -1,7 +1,7 @@ import pandas as pd import cudf -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq def test_single_q(): diff --git a/python/cudf/cudf/tests/test_query.py b/python/cudf/cudf/tests/test_query.py index 6e81f6f8457..8dc5df2dd7c 100644 --- a/python/cudf/cudf/tests/test_query.py +++ b/python/cudf/cudf/tests/test_query.py @@ -12,7 +12,7 @@ import cudf from cudf.core import DataFrame -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq from cudf.utils import queryutils _params_query_parser = [] diff --git a/python/cudf/cudf/tests/test_query_mask.py b/python/cudf/cudf/tests/test_query_mask.py index 08218a3bdbf..ab1c085c6c0 100644 --- a/python/cudf/cudf/tests/test_query_mask.py +++ b/python/cudf/cudf/tests/test_query_mask.py @@ -3,7 +3,7 @@ import pytest import cudf -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq _data = [ {"a": [0, 1.0, 2.0, None, np.nan, None, 3, 5]}, diff --git a/python/cudf/cudf/tests/test_rank.py b/python/cudf/cudf/tests/test_rank.py index dfd8d4824cd..3c98496def3 100644 --- a/python/cudf/cudf/tests/test_rank.py +++ b/python/cudf/cudf/tests/test_rank.py @@ -7,7 +7,7 @@ import pytest from cudf.core import DataFrame -from cudf.testing.utils import assert_eq, assert_exceptions_equal +from cudf.testing._utils import assert_eq, assert_exceptions_equal @pytest.fixture diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index 53b07a253ff..433f9d2f6ac 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -12,7 +12,7 @@ import cudf from cudf.core import Series from cudf.testing import utils -from cudf.testing.utils import NUMERIC_TYPES, gen_rand +from cudf.testing._utils import NUMERIC_TYPES, gen_rand params_dtype = NUMERIC_TYPES diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index 7c7dd948e13..c2da31098d4 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -8,7 +8,7 @@ import cudf from cudf.core import DataFrame, Series -from cudf.testing.utils import ( +from cudf.testing._utils import ( INTEGER_TYPES, NUMERIC_TYPES, assert_eq, diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index d25a2dd68ac..0c4313eb47c 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -9,7 +9,7 @@ import cudf from cudf import melt as cudf_melt from cudf.core._compat import PANDAS_GE_120 -from cudf.testing.utils import ( +from cudf.testing._utils import ( ALL_TYPES, DATETIME_TYPES, NUMERIC_TYPES, diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index 76e09eb5069..07e7f43c992 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -8,7 +8,7 @@ import cudf from cudf.core._compat import PANDAS_GE_110 -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index 300a4f6e917..133597b8f19 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -14,7 +14,7 @@ import pytest import cudf -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq moto = pytest.importorskip("moto", minversion="1.3.14") boto3 = pytest.importorskip("boto3") diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index 6e8830fb207..05a826415a7 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -9,7 +9,7 @@ import cudf from cudf import Scalar as pycudf_scalar from cudf._lib.copying import get_element -from cudf.testing.utils import ( +from cudf.testing._utils import ( ALL_TYPES, DATETIME_TYPES, NUMERIC_TYPES, diff --git a/python/cudf/cudf/tests/test_scan.py b/python/cudf/cudf/tests/test_scan.py index 33e9953c9ab..0f4cea5f812 100644 --- a/python/cudf/cudf/tests/test_scan.py +++ b/python/cudf/cudf/tests/test_scan.py @@ -5,7 +5,7 @@ import pytest import cudf -from cudf.testing.utils import ( +from cudf.testing._utils import ( INTEGER_TYPES, NUMERIC_TYPES, assert_eq, diff --git a/python/cudf/cudf/tests/test_search.py b/python/cudf/cudf/tests/test_search.py index 28da93d3401..c16c6486cd4 100644 --- a/python/cudf/cudf/tests/test_search.py +++ b/python/cudf/cudf/tests/test_search.py @@ -5,7 +5,7 @@ import pytest import cudf -from cudf.testing.utils import assert_eq, gen_rand, random_bitmask +from cudf.testing._utils import assert_eq, gen_rand, random_bitmask @pytest.mark.parametrize("side", ["left", "right"]) diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index d76575eb8cc..9fba750bae2 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -9,7 +9,7 @@ import cudf from cudf.testing import utils -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 5b9ecc98b40..8264017e905 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -9,7 +9,7 @@ import pytest import cudf -from cudf.testing.utils import ( +from cudf.testing._utils import ( DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES, diff --git a/python/cudf/cudf/tests/test_seriesmap.py b/python/cudf/cudf/tests/test_seriesmap.py index be08b3ba9a4..d4ef3ba235d 100644 --- a/python/cudf/cudf/tests/test_seriesmap.py +++ b/python/cudf/cudf/tests/test_seriesmap.py @@ -9,7 +9,7 @@ import cudf from cudf import Series -from cudf.testing.utils import assert_eq, assert_exceptions_equal +from cudf.testing._utils import assert_eq, assert_exceptions_equal def test_series_map_basic(): diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 9ea891dbda4..921a6b1556a 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -6,7 +6,7 @@ import cudf from cudf.core._compat import PANDAS_EQ_123, PANDAS_GE_120 -from cudf.testing.utils import assert_eq, assert_exceptions_equal +from cudf.testing._utils import assert_eq, assert_exceptions_equal @pytest.mark.parametrize("df", [pd.DataFrame({"a": [1, 2, 3]})]) diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index 5c8b278aaba..95942045654 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -9,7 +9,7 @@ from cudf.core import DataFrame, Series from cudf.core.column import NumericalColumn -from cudf.testing.utils import ( +from cudf.testing._utils import ( DATETIME_TYPES, NUMERIC_TYPES, assert_eq, diff --git a/python/cudf/cudf/tests/test_sparse_df.py b/python/cudf/cudf/tests/test_sparse_df.py index 23d149fe78d..50c8f3f41a8 100644 --- a/python/cudf/cudf/tests/test_sparse_df.py +++ b/python/cudf/cudf/tests/test_sparse_df.py @@ -8,7 +8,7 @@ from cudf.comm.gpuarrow import GpuArrowReader from cudf.core import DataFrame, Series -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq def read_data(): diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index ffa605d5782..d4e944848c9 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -9,7 +9,7 @@ import cudf from cudf.datasets import randomdata -from cudf.testing.utils import assert_eq, assert_exceptions_equal +from cudf.testing._utils import assert_eq, assert_exceptions_equal params_dtypes = [np.int32, np.uint32, np.float32, np.float64] methods = ["min", "max", "sum", "mean", "var", "std"] diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 6cc64a999aa..5fd3ffe43d5 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -17,7 +17,7 @@ from cudf.core._compat import PANDAS_GE_110 from cudf.core.column.string import StringColumn from cudf.core.index import StringIndex, as_index -from cudf.testing.utils import ( +from cudf.testing._utils import ( DATETIME_TYPES, NUMERIC_TYPES, assert_eq, diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index abfe1f3a73c..e89db6fa138 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -4,7 +4,7 @@ import pytest import cudf -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_testing.py b/python/cudf/cudf/tests/test_testing.py index a90c991937b..b2e5ea70ddc 100644 --- a/python/cudf/cudf/tests/test_testing.py +++ b/python/cudf/cudf/tests/test_testing.py @@ -10,7 +10,7 @@ assert_index_equal, assert_series_equal, ) -from cudf.testing.utils import NUMERIC_TYPES, OTHER_TYPES, assert_eq +from cudf.testing._utils import NUMERIC_TYPES, OTHER_TYPES, assert_eq @pytest.mark.parametrize("rdata", [[1, 2, 5], [1, 2, 6], [1, 2, 5, 6]]) diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py index be01e6f7c48..79e9c68716e 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/test_text.py @@ -5,7 +5,7 @@ import pytest import cudf -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq def test_tokenize(): diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index 524731295fc..07a4564d2ba 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -12,7 +12,7 @@ import cudf from cudf.core._compat import PANDAS_GE_120 from cudf.testing import utils as utils -from cudf.testing.utils import assert_eq, assert_exceptions_equal +from cudf.testing._utils import assert_eq, assert_exceptions_equal _TIMEDELTA_DATA = [ [1000000, 200000, 3000000], diff --git a/python/cudf/cudf/tests/test_transform.py b/python/cudf/cudf/tests/test_transform.py index 3e77f74d350..f9fadb15304 100644 --- a/python/cudf/cudf/tests/test_transform.py +++ b/python/cudf/cudf/tests/test_transform.py @@ -6,7 +6,7 @@ import pytest from cudf.core import Series -from cudf.testing.utils import NUMERIC_TYPES +from cudf.testing._utils import NUMERIC_TYPES supported_types = NUMERIC_TYPES diff --git a/python/custreamz/custreamz/tests/test_kafka.py b/python/custreamz/custreamz/tests/test_kafka.py index 157b021a0b7..d29ebf8db8b 100644 --- a/python/custreamz/custreamz/tests/test_kafka.py +++ b/python/custreamz/custreamz/tests/test_kafka.py @@ -2,7 +2,7 @@ import confluent_kafka as ck import pytest -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq @pytest.mark.parametrize("commit_offset", [-1, 0, 1, 1000]) diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 0ede420ed2e..bfe4ca9d2e4 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -8,7 +8,7 @@ import dask_cudf as dgd from cudf import DataFrame, Series -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq ############################################################################# # Datetime Accessor # diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 6091d0a5681..a3a15a5da38 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -706,7 +706,7 @@ def test_dataframe_set_index(): pddf = dd.from_pandas(pdf, npartitions=4) pddf = pddf.set_index("str") - from cudf.testing.utils import assert_eq + from cudf.testing._utils import assert_eq assert_eq(ddf.compute(), pddf.compute()) diff --git a/python/dask_cudf/dask_cudf/tests/test_distributed.py b/python/dask_cudf/dask_cudf/tests/test_distributed.py index 7e4adace212..f16bf8889ce 100644 --- a/python/dask_cudf/dask_cudf/tests/test_distributed.py +++ b/python/dask_cudf/dask_cudf/tests/test_distributed.py @@ -9,7 +9,7 @@ import dask_cudf import cudf -from cudf.testing.utils import assert_eq +from cudf.testing._utils import assert_eq dask_cuda = pytest.importorskip("dask_cuda") From 5ff10d599c2d00f700cf2fbf0bcc214416536942 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 4 Mar 2021 18:25:41 -0600 Subject: [PATCH 05/54] Apply suggestions from code review --- python/cudf/cudf/core/column/categorical.py | 1 - python/cudf/cudf/core/column/column.py | 1 - 2 files changed, 2 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index e0e56edb2f1..c41a458f02b 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -936,7 +936,6 @@ def ordered(self) -> Optional[bool]: def ordered(self, value: bool): self.dtype.ordered = value - # @property def cat(self, parent: ParentType = None): return CategoricalAccessor(self, parent=parent) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 8e691773e3a..d6ea81d17f8 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -189,7 +189,6 @@ def __sizeof__(self) -> int: n += bitmask_allocation_size_bytes(self.size) return n - # @property def cat( self, parent=None ) -> "cudf.core.column.categorical.CategoricalAccessor": From 5d18fb78389f1535e014daf744f18c182ad8c24a Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 11 Mar 2021 11:44:19 -0800 Subject: [PATCH 06/54] pyarrow 2.0 --- conda/recipes/cudf/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index f8ecb711d9b..e213bc06062 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -27,7 +27,7 @@ requirements: - setuptools - numba >=0.49.0 - dlpack - - pyarrow 3.0.0 + - pyarrow 2.0.0 - libcudf {{ version }} - rmm {{ minor_version }} - cudatoolkit {{ cuda_version }} From 202130495d29715cc8f987b23d3e51436a3ba618 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 11 Mar 2021 11:44:57 -0800 Subject: [PATCH 07/54] pyarrow 2.0 --- conda/recipes/libcudf/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 85aa0f08b48..01f429531e8 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -37,7 +37,7 @@ requirements: host: - librmm {{ minor_version }}.* - cudatoolkit {{ cuda_version }}.* - - arrow-cpp 3.0.0 + - arrow-cpp 2.0.0 - arrow-cpp-proc * cuda - boost-cpp 1.72.0 - dlpack From 3ff2c807799aea2abbae7092529134d407f35f24 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 16 Mar 2021 18:52:23 -0500 Subject: [PATCH 08/54] sync code-base --- python/cudf/cudf/tests/test_applymap.py | 2 +- python/cudf/cudf/tests/test_binops.py | 2 +- python/cudf/cudf/tests/test_dataframe.py | 2 +- python/cudf/cudf/tests/test_indexing.py | 2 +- python/cudf/cudf/tests/test_onehot.py | 2 +- python/cudf/cudf/tests/test_reductions.py | 2 +- python/cudf/cudf/tests/test_repr.py | 2 +- python/cudf/cudf/tests/test_serialize.py | 2 +- python/cudf/cudf/tests/test_unaops.py | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/tests/test_applymap.py b/python/cudf/cudf/tests/test_applymap.py index eeacf05b33b..fa3c88a3551 100644 --- a/python/cudf/cudf/tests/test_applymap.py +++ b/python/cudf/cudf/tests/test_applymap.py @@ -7,7 +7,7 @@ import pytest from cudf import Series -from cudf.testing import utils +from cudf.testing import _utils as utils @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 32f64a61894..5f8412585d1 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -14,7 +14,7 @@ import cudf from cudf.core import Series from cudf.core.index import as_index -from cudf.testing import utils +from cudf.testing import _utils as utils from cudf.utils.dtypes import ( BOOL_TYPES, DATETIME_TYPES, diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 07a00d510eb..a79b6c73b0f 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -20,7 +20,7 @@ import cudf from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_120 from cudf.core.column import column -from cudf.testing import utils +from cudf.testing import _utils as utils from cudf.testing._utils import ( ALL_TYPES, DATETIME_TYPES, diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 3a35d459d5f..1e67b5208a2 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -9,7 +9,7 @@ import cudf from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_120 -from cudf.testing import utils +from cudf.testing import _utils as utils from cudf.testing._utils import ( INTEGER_TYPES, assert_eq, diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py index 1cc2d90e501..286ba852356 100644 --- a/python/cudf/cudf/tests/test_onehot.py +++ b/python/cudf/cudf/tests/test_onehot.py @@ -8,7 +8,7 @@ import cudf from cudf.core import DataFrame, GenericIndex, Series -from cudf.testing import utils +from cudf.testing import _utils as utils def test_onehot_simple(): diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index 433f9d2f6ac..0d96cbee942 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -11,7 +11,7 @@ import cudf from cudf.core import Series -from cudf.testing import utils +from cudf.testing import _utils as utils from cudf.testing._utils import NUMERIC_TYPES, gen_rand params_dtype = NUMERIC_TYPES diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index d9de9335889..30460ddee03 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -10,7 +10,7 @@ import cudf from cudf.core._compat import PANDAS_GE_110 -from cudf.testing import utils +from cudf.testing import _utils as utils from cudf.utils.dtypes import cudf_dtypes_to_pandas_dtypes repr_categories = utils.NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index 9fba750bae2..49eefe19616 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -8,7 +8,7 @@ import pytest import cudf -from cudf.testing import utils +from cudf.testing import _utils as utils from cudf.testing._utils import assert_eq diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py index 61e437ef9e8..2089f764724 100644 --- a/python/cudf/cudf/tests/test_unaops.py +++ b/python/cudf/cudf/tests/test_unaops.py @@ -10,7 +10,7 @@ import cudf from cudf.core import Series -from cudf.testing import utils +from cudf.testing import _utils as utils _unaops = [operator.abs, operator.invert, operator.neg, np.ceil, np.floor] From 88794fe379024489c41572cd0f04c6f5b8f137e2 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 16 Mar 2021 19:10:20 -0500 Subject: [PATCH 09/54] fix imports --- python/cudf/cudf/tests/test_decimal.py | 10 +++------- python/cudf/cudf/tests/test_timedelta.py | 2 +- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py index ddf56828c3d..ed2782c8c58 100644 --- a/python/cudf/cudf/tests/test_decimal.py +++ b/python/cudf/cudf/tests/test_decimal.py @@ -5,15 +5,11 @@ import numpy as np import pyarrow as pa import pytest -import cudf -from cudf.core.dtypes import Decimal64Dtype +import cudf from cudf.core.column import DecimalColumn, NumericalColumn - -from cudf.tests.utils import ( - FLOAT_TYPES, - assert_eq, -) +from cudf.core.dtypes import Decimal64Dtype +from cudf.testing._utils import FLOAT_TYPES, assert_eq @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index 07a4564d2ba..a65fdeeb0dd 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -11,7 +11,7 @@ import cudf from cudf.core._compat import PANDAS_GE_120 -from cudf.testing import utils as utils +from cudf.testing import _utils as utils from cudf.testing._utils import assert_eq, assert_exceptions_equal _TIMEDELTA_DATA = [ From 90a860150f6a681ca2ddd0675922b0ff34162c8b Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 17 Mar 2021 08:26:38 -0700 Subject: [PATCH 10/54] change arrow version --- conda/recipes/libcudf/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 709194f0530..368c55141f4 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -37,7 +37,7 @@ requirements: host: - librmm {{ minor_version }}.* - cudatoolkit {{ cuda_version }}.* - - arrow-cpp 2.0.0 + - arrow-cpp 3.0.0 - arrow-cpp-proc * cuda - boost-cpp 1.72.0 - dlpack From 0545fdd2eba1de9d8cf0ffdda878aaf27f13e3ef Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 6 Apr 2021 21:01:38 -0700 Subject: [PATCH 11/54] sync changes --- python/cudf/cudf/comm/gpuarrow.py | 29 +++++++++------------- python/cudf/cudf/testing/_utils.py | 33 ++++++++++++++++++++++++++ python/cudf/cudf/tests/test_binops.py | 2 +- python/cudf/cudf/tests/test_joining.py | 32 +------------------------ python/cudf/cudf/tests/test_string.py | 2 +- 5 files changed, 47 insertions(+), 51 deletions(-) diff --git a/python/cudf/cudf/comm/gpuarrow.py b/python/cudf/cudf/comm/gpuarrow.py index 16ddb582605..451572224c6 100644 --- a/python/cudf/cudf/comm/gpuarrow.py +++ b/python/cudf/cudf/comm/gpuarrow.py @@ -1,5 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. - +# Copyright (c) 2019-2020, NVIDIA CORPORATION. from collections import OrderedDict from collections.abc import Sequence @@ -10,7 +9,7 @@ from cudf._lib.gpuarrow import ( CudaRecordBatchStreamReader as _CudaRecordBatchStreamReader, ) -from cudf.core import DataFrame, Series, column +from cudf.core import Series, column from cudf.utils.utils import mask_bitsize, mask_dtype @@ -34,25 +33,19 @@ def __init__(self, source, schema=None): class GpuArrowReader(Sequence): def __init__(self, schema, dev_ary): - table = CudaRecordBatchStreamReader(dev_ary, schema).read_all() - self._df = DataFrame.from_arrow(table) - self._schema = pa.Schema.from_pandas(self._df) + self._table = CudaRecordBatchStreamReader(dev_ary, schema).read_all() def __len__(self): - return len(self._df._data.names) + return self._table.num_columns def __getitem__(self, idx): - return GpuArrowNodeReader( - schema=self._schema, - field=self._schema[idx], - series=self._df._data.columns[idx], - ) + return GpuArrowNodeReader(self._table, idx) def schema(self): """ Return a pyarrow schema """ - return self._schema + return self._table.schema def to_dict(self): """ @@ -65,10 +58,10 @@ def to_dict(self): class GpuArrowNodeReader(object): - def __init__(self, schema, field, series): - self._schema = schema - self._field = field - self._series = Series(column.as_column(series)) + def __init__(self, table, index): + self._table = table + self._field = table.schema[index] + self._series = Series(column.as_column(table.column(index))) self._series.name = self.name def __len__(self): @@ -76,7 +69,7 @@ def __len__(self): @property def schema(self): - return self._schema + return self._table.schema @property def field_schema(self): diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index 37a74ab4760..055535d2215 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -15,6 +15,8 @@ from cudf.core.column.datetime import _numpy_to_pandas_conversion from cudf.utils import dtypes as dtypeutils +_JOIN_TYPES = ("left", "inner", "outer", "right", "leftanti", "leftsemi") + supported_numpy_dtypes = [ "bool", "int8", @@ -299,3 +301,34 @@ def gen_rand_series(dtype, size, **kwargs): @contextmanager def does_not_raise(): yield + + +def assert_join_results_equal(expect, got, how, **kwargs): + if how not in _JOIN_TYPES: + raise ValueError(f"Unrecognized join type {how}") + if how == "right": + got = got[expect.columns] + + if isinstance(expect, (pd.Series, cudf.Series)): + return assert_eq( + expect.sort_values().reset_index(drop=True), + got.sort_values().reset_index(drop=True), + **kwargs, + ) + elif isinstance(expect, (pd.DataFrame, cudf.DataFrame)): + if not len( + expect.columns + ): # can't sort_values() on a df without columns + return assert_eq(expect, got, **kwargs) + + assert_eq( + expect.sort_values(expect.columns.to_list()).reset_index( + drop=True + ), + got.sort_values(got.columns.to_list()).reset_index(drop=True), + **kwargs, + ) + elif isinstance(expect, (pd.Index, cudf.Index)): + return assert_eq(expect.sort_values(), got.sort_values(), **kwargs) + else: + raise ValueError(f"Not a join result: {type(expect).__name__}") diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 7a6fa526402..f205eae68e5 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -2037,7 +2037,7 @@ def test_binops_decimal(args): ), ], ) -@pytest.mark.parametrize("integer_dtype", cudf.tests.utils.INTEGER_TYPES) +@pytest.mark.parametrize("integer_dtype", utils.INTEGER_TYPES) @pytest.mark.parametrize("reflected", [True, False]) def test_binops_decimal_comp_mixed_integer(args, integer_dtype, reflected): """ diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index d1e748cf77b..73b71bee77d 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -12,6 +12,7 @@ NUMERIC_TYPES, assert_eq, assert_exceptions_equal, + assert_join_results_equal, ) _JOIN_TYPES = ("left", "inner", "outer", "right", "leftanti", "leftsemi") @@ -71,37 +72,6 @@ def pd_odd_joins(left, right, join_type): return left[left.index.isin(right.index)][left.columns] -def assert_join_results_equal(expect, got, how, **kwargs): - if how not in _JOIN_TYPES: - raise ValueError(f"Unrecognized join type {how}") - if how == "right": - got = got[expect.columns] - - if isinstance(expect, (pd.Series, cudf.Series)): - return assert_eq( - expect.sort_values().reset_index(drop=True), - got.sort_values().reset_index(drop=True), - **kwargs, - ) - elif isinstance(expect, (pd.DataFrame, cudf.DataFrame)): - if not len( - expect.columns - ): # can't sort_values() on a df without columns - return assert_eq(expect, got, **kwargs) - - assert_eq( - expect.sort_values(expect.columns.to_list()).reset_index( - drop=True - ), - got.sort_values(got.columns.to_list()).reset_index(drop=True), - **kwargs, - ) - elif isinstance(expect, (pd.Index, cudf.Index)): - return assert_eq(expect.sort_values(), got.sort_values(), **kwargs) - else: - raise ValueError(f"Not a join result: {type(expect).__name__}") - - @pytest.mark.parametrize("aa,bb,how,method", make_params()) def test_dataframe_join_how(aa, bb, how, method): df = cudf.DataFrame() diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 668247a027d..32cf2592409 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -22,8 +22,8 @@ NUMERIC_TYPES, assert_eq, assert_exceptions_equal, + assert_join_results_equal, ) -from cudf.tests.test_joining import assert_join_results_equal from cudf.utils import dtypes as dtypeutils data_list = [ From baeabe97375bb5434df44ef61358a65ddaea7bc8 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 6 Apr 2021 21:03:54 -0700 Subject: [PATCH 12/54] bump up the arrow version --- conda/environments/cudf_dev_cuda10.1.yml | 4 ++-- conda/environments/cudf_dev_cuda10.2.yml | 4 ++-- conda/environments/cudf_dev_cuda11.0.yml | 4 ++-- conda/recipes/libcudf/meta.yaml | 2 +- cpp/cmake/thirdparty/CUDF_GetArrow.cmake | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/conda/environments/cudf_dev_cuda10.1.yml b/conda/environments/cudf_dev_cuda10.1.yml index bbf63dd46e4..1fb1637e357 100644 --- a/conda/environments/cudf_dev_cuda10.1.yml +++ b/conda/environments/cudf_dev_cuda10.1.yml @@ -18,7 +18,7 @@ dependencies: - numba>=0.49.0,!=0.51.0 - numpy - pandas>=1.0,<1.3.0dev0 - - pyarrow=3.0.0 + - pyarrow=4.0.0 - fastavro>=0.22.9 - notebook>=0.5.0 - cython>=0.29,<0.30 @@ -46,7 +46,7 @@ dependencies: - distributed>=2.22.0,<=2021.3.1 - streamz - dlpack - - arrow-cpp=3.0.0 + - arrow-cpp=4.0.0 - arrow-cpp-proc * cuda - boost-cpp>=1.72.0 - double-conversion diff --git a/conda/environments/cudf_dev_cuda10.2.yml b/conda/environments/cudf_dev_cuda10.2.yml index 6a598ed4d37..a31d18ce254 100644 --- a/conda/environments/cudf_dev_cuda10.2.yml +++ b/conda/environments/cudf_dev_cuda10.2.yml @@ -18,7 +18,7 @@ dependencies: - numba>=0.49,!=0.51.0 - numpy - pandas>=1.0,<1.3.0dev0 - - pyarrow=3.0.0 + - pyarrow=4.0.0 - fastavro>=0.22.9 - notebook>=0.5.0 - cython>=0.29,<0.30 @@ -46,7 +46,7 @@ dependencies: - distributed>=2.22.0,<=2021.3.1 - streamz - dlpack - - arrow-cpp=3.0.0 + - arrow-cpp=4.0.0 - arrow-cpp-proc * cuda - boost-cpp>=1.72.0 - double-conversion diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index 2bd3b70c617..ad24311b952 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -18,7 +18,7 @@ dependencies: - numba>=0.49,!=0.51.0 - numpy - pandas>=1.0,<1.3.0dev0 - - pyarrow=3.0.0 + - pyarrow=4.0.0 - fastavro>=0.22.9 - notebook>=0.5.0 - cython>=0.29,<0.30 @@ -46,7 +46,7 @@ dependencies: - distributed>=2.22.0,<=2021.3.1 - streamz - dlpack - - arrow-cpp=3.0.0 + - arrow-cpp=4.0.0 - arrow-cpp-proc * cuda - boost-cpp>=1.72.0 - double-conversion diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 7128c4a2f78..c3344ab7750 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -37,7 +37,7 @@ requirements: host: - librmm {{ minor_version }}.* - cudatoolkit {{ cuda_version }}.* - - arrow-cpp 3.0.0 + - arrow-cpp 4.0.0 - arrow-cpp-proc * cuda - boost-cpp 1.72.0 - dlpack diff --git a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake index 752b84ea78b..1224c8bf728 100644 --- a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake @@ -120,6 +120,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) endfunction() -set(CUDF_VERSION_Arrow 3.0.0) +set(CUDF_VERSION_Arrow 4.0.0) find_and_configure_arrow(${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC}) From 9984297a4632824f0931fae5c69a3b8f14b140a7 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 6 Apr 2021 21:07:45 -0700 Subject: [PATCH 13/54] remove xfail --- python/cudf/cudf/tests/test_gpu_arrow_parser.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_gpu_arrow_parser.py b/python/cudf/cudf/tests/test_gpu_arrow_parser.py index baf2fa62e38..a088ae9f923 100644 --- a/python/cudf/cudf/tests/test_gpu_arrow_parser.py +++ b/python/cudf/cudf/tests/test_gpu_arrow_parser.py @@ -179,7 +179,6 @@ def make_gpu_parse_arrow_cats_batch(): def test_gpu_parse_arrow_cats(): - pytest.xfail(reason="need dictionary mapping in libcudf from_arrow") batch = make_gpu_parse_arrow_cats_batch() stream = pa.BufferOutputStream() From 707dcc0c987d5c46d93ab4ffa9e5357519f704b9 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 29 Apr 2021 09:47:48 -0700 Subject: [PATCH 14/54] update 11.2 yml --- conda/environments/cudf_dev_cuda11.2.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml index 24f7f7a1144..5debe8d86c0 100644 --- a/conda/environments/cudf_dev_cuda11.2.yml +++ b/conda/environments/cudf_dev_cuda11.2.yml @@ -18,7 +18,7 @@ dependencies: - numba>=0.53.1 - numpy - pandas>=1.0,<1.3.0dev0 - - pyarrow=1.0.1 + - pyarrow=4.0.0 - fastavro>=0.22.9 - notebook>=0.5.0 - cython>=0.29,<0.30 @@ -46,7 +46,7 @@ dependencies: - distributed>=2.22.0,<=2021.4.0 - streamz - dlpack - - arrow-cpp=1.0.1 + - arrow-cpp=4.0.0 - arrow-cpp-proc * cuda - boost-cpp>=1.72.0 - double-conversion From 7005ed01d30c34e3c853eeda411aff0c9f6b24c0 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 1 Jun 2021 07:44:15 -0700 Subject: [PATCH 15/54] bump arrow patch version --- conda/environments/cudf_dev_cuda11.0.yml | 2 +- conda/environments/cudf_dev_cuda11.2.yml | 2 +- conda/recipes/libcudf/meta.yaml | 2 +- cpp/cmake/thirdparty/CUDF_GetArrow.cmake | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index 2f4564a845e..4ddda14703c 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -17,7 +17,7 @@ dependencies: - numba>=0.53.1 - numpy - pandas>=1.0,<1.3.0dev0 - - pyarrow=4.0.0 + - pyarrow=4.0.1 - fastavro>=0.22.9 - notebook>=0.5.0 - cython>=0.29,<0.30 diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml index 59d3f33fa96..04cd8225931 100644 --- a/conda/environments/cudf_dev_cuda11.2.yml +++ b/conda/environments/cudf_dev_cuda11.2.yml @@ -17,7 +17,7 @@ dependencies: - numba>=0.53.1 - numpy - pandas>=1.0,<1.3.0dev0 - - pyarrow=4.0.0 + - pyarrow=4.0.1 - fastavro>=0.22.9 - notebook>=0.5.0 - cython>=0.29,<0.30 diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index d12f0e1b5c3..17d9b05ea4f 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -37,7 +37,7 @@ requirements: host: - librmm {{ minor_version }}.* - cudatoolkit {{ cuda_version }}.* - - arrow-cpp 4.0.0 + - arrow-cpp 4.0.1 - arrow-cpp-proc * cuda - dlpack>=0.5,<0.6.0a0 run: diff --git a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake index 3866c6ab64d..b9b6ab18d1f 100644 --- a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake @@ -121,6 +121,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) endfunction() -set(CUDF_VERSION_Arrow 4.0.0) +set(CUDF_VERSION_Arrow 4.0.1) find_and_configure_arrow(${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC}) From 290bc16588d1e5e8cd963ee7abf5241bf875058a Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 1 Jun 2021 08:37:18 -0700 Subject: [PATCH 16/54] remove stale code --- python/cudf/cudf/core/column/column.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index f6283b18150..a58b2eda822 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -63,7 +63,6 @@ from cudf.utils.utils import mask_dtype T = TypeVar("T", bound="ColumnBase") -ParentType = Union["cudf.Series", "cudf.Index"] class ColumnBase(Column, Serializable): From 6cd394dd6b11fdd61ebc0f742c03298a529d85be Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 1 Jun 2021 08:46:39 -0700 Subject: [PATCH 17/54] temporary change --- ci/gpu/build.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index e1ddfa1cc56..ed247cb637e 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -85,6 +85,7 @@ gpuci_conda_retry install -y \ # https://docs.rapids.ai/maintainers/depmgmt/ # gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env # gpuci_conda_retry install -y "your-pkg=1.0.0" +gpuci_conda_retry install -y "arrow-cpp=4.0.1 pyarrow=4.0.1" gpuci_logger "Check compiler versions" From cab9539c4972f79fe82ebbc08e72bb40fb71d6c8 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 1 Jun 2021 09:33:37 -0700 Subject: [PATCH 18/54] remove to_pandas() --- python/cudf/cudf/_lib/utils.pyx | 3 --- python/cudf/cudf/core/dataframe.py | 7 +------ 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 199d6fb32a4..449d01357b0 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -139,9 +139,6 @@ cpdef generate_pandas_metadata(Table table, index): metadata = pa.pandas_compat.construct_metadata( columns_to_convert=[ - col.to_pandas() - if isinstance(col, cudf.core.column.CategoricalColumn) - else col for col in table._data.columns ], diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index eb42166aaa5..c21a807edb0 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5666,12 +5666,7 @@ def to_arrow(self, preserve_index=True): out = super(DataFrame, data).to_arrow() metadata = pa.pandas_compat.construct_metadata( - columns_to_convert=[ - col.to_pandas() - if isinstance(col, cudf.core.column.CategoricalColumn) - else col - for col in self._data.columns - ], + columns_to_convert=[self[col] for col in self._data.names], df=self, column_names=out.schema.names, index_levels=[self.index], From 173d7947ee9373a58880b33c937917d5c6e19d7e Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 1 Jun 2021 09:36:52 -0700 Subject: [PATCH 19/54] imports --- python/dask_cudf/dask_cudf/tests/test_core.py | 4 ++-- python/dask_cudf/dask_cudf/tests/test_distributed.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 0110e5e38d5..07920863186 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -18,10 +18,10 @@ from dask.utils import M -import dask_cudf as dgd - import cudf +import dask_cudf as dgd + def test_from_cudf(): np.random.seed(0) diff --git a/python/dask_cudf/dask_cudf/tests/test_distributed.py b/python/dask_cudf/dask_cudf/tests/test_distributed.py index b7dc17f08b6..876a66f78d7 100644 --- a/python/dask_cudf/dask_cudf/tests/test_distributed.py +++ b/python/dask_cudf/dask_cudf/tests/test_distributed.py @@ -6,11 +6,11 @@ from dask.distributed import Client from distributed.utils_test import loop # noqa: F401 -import dask_cudf - import cudf from cudf.testing._utils import assert_eq +import dask_cudf + dask_cuda = pytest.importorskip("dask_cuda") From b9a90bf4753f93247c0d77c7674174543364d2a1 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 8 Jun 2021 13:10:54 -0500 Subject: [PATCH 20/54] temp commit --- ci/gpu/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index ed247cb637e..66cc0e307ad 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -83,7 +83,7 @@ gpuci_conda_retry install -y \ "ucx-py=${MINOR_VERSION}" # https://docs.rapids.ai/maintainers/depmgmt/ -# gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env +gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env # gpuci_conda_retry install -y "your-pkg=1.0.0" gpuci_conda_retry install -y "arrow-cpp=4.0.1 pyarrow=4.0.1" From 0fb2c6d240d3c267850cec2d96b36257c53b510e Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 14 Jun 2021 14:29:37 -0500 Subject: [PATCH 21/54] style --- python/cudf/cudf/tests/test_categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index ea09670a662..1c384b57257 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -9,7 +9,7 @@ import cudf from cudf.core._compat import PANDAS_GE_110 -from cudf.testing._utils import NUMERIC_TYPES, assert_eq, assert_exceptions_equal +from cudf.testing._utils import NUMERIC_TYPES, assert_exceptions_equal, assert_eq @pytest.fixture From 6046433fe702128dd1cd8488c41ab726391320c4 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 14 Jun 2021 14:35:36 -0500 Subject: [PATCH 22/54] style --- python/cudf/cudf/tests/test_categorical.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 1c384b57257..dc9610176c9 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -9,7 +9,11 @@ import cudf from cudf.core._compat import PANDAS_GE_110 -from cudf.testing._utils import NUMERIC_TYPES, assert_exceptions_equal, assert_eq +from cudf.testing._utils import ( + NUMERIC_TYPES, + assert_eq, + assert_exceptions_equal, +) @pytest.fixture From 995b4a529c0341819eb7bbf3d133a1525493bffe Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 15 Jun 2021 14:07:53 -0500 Subject: [PATCH 23/54] temp commit --- ci/gpu/build.sh | 2 +- python/cudf/cudf/tests/test_cut.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 54a8e341c8b..6da273a08de 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -169,7 +169,7 @@ else for gt in gtests/* ; do test_name=$(basename ${gt}) echo "Running GoogleTest $test_name" - ${gt} --gtest_output=xml:"$WORKSPACE/test-results/" + ${gt} done CUDF_CONDA_FILE=`find ${CONDA_ARTIFACT_PATH} -name "libcudf-*.tar.bz2"` diff --git a/python/cudf/cudf/tests/test_cut.py b/python/cudf/cudf/tests/test_cut.py index 926826ac188..710df78e36b 100644 --- a/python/cudf/cudf/tests/test_cut.py +++ b/python/cudf/cudf/tests/test_cut.py @@ -4,11 +4,12 @@ Test related to Cut """ -import pandas as pd import numpy as np -from cudf.core.cut import cut +import pandas as pd import pytest -from cudf.tests.utils import assert_eq + +from cudf.core.cut import cut +from cudf.testing._utils import assert_eq @pytest.mark.parametrize( From 7a199ad5d8a8b24c58da1c619eeccb76c74a24ee Mon Sep 17 00:00:00 2001 From: ptaylor Date: Wed, 16 Jun 2021 18:46:56 -0500 Subject: [PATCH 24/54] flush after writing second JSON line --- cpp/tests/io/arrow_io_source_test.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/tests/io/arrow_io_source_test.cpp b/cpp/tests/io/arrow_io_source_test.cpp index 24964db5f8c..72ddb87f19b 100644 --- a/cpp/tests/io/arrow_io_source_test.cpp +++ b/cpp/tests/io/arrow_io_source_test.cpp @@ -43,7 +43,8 @@ TEST_F(ArrowIOTest, URIFileSystem) { const std::string file_name = temp_env->get_temp_dir() + "JsonLinesFileTest.json"; std::ofstream outfile(file_name, std::ofstream::out); - outfile << "[11, 1.1]\n[22, 2.2]"; + outfile << "[11, 1.1]" << std::endl; + outfile << "[22, 2.2]" << std::endl; outfile.close(); std::string file_uri = "file://" + file_name; From 79a364e785578cd568a867767b3565620fb9fd51 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 17 Jun 2021 08:25:45 -0700 Subject: [PATCH 25/54] add back xml output --- ci/gpu/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 8a1ee122dfc..d63ec64faf2 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -171,7 +171,7 @@ else for gt in gtests/* ; do test_name=$(basename ${gt}) echo "Running GoogleTest $test_name" - ${gt} + ${gt} --gtest_output=xml:"$WORKSPACE/test-results/" done CUDF_CONDA_FILE=`find ${CONDA_ARTIFACT_PATH} -name "libcudf-*.tar.bz2"` From c9c9a153709d96658eeecc6cc3c5af756d8c98e3 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 17 Jun 2021 15:39:22 -0700 Subject: [PATCH 26/54] temp commit --- cpp/tests/io/arrow_io_source_test.cpp | 35 ++++++++++++++++----------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/cpp/tests/io/arrow_io_source_test.cpp b/cpp/tests/io/arrow_io_source_test.cpp index 72ddb87f19b..3fdb3988194 100644 --- a/cpp/tests/io/arrow_io_source_test.cpp +++ b/cpp/tests/io/arrow_io_source_test.cpp @@ -46,20 +46,27 @@ TEST_F(ArrowIOTest, URIFileSystem) outfile << "[11, 1.1]" << std::endl; outfile << "[22, 2.2]" << std::endl; outfile.close(); - - std::string file_uri = "file://" + file_name; - std::unique_ptr datasource = - std::make_unique(file_uri); - - // Populate the JSON Reader Options - cudf::io::json_reader_options options = - cudf::io::json_reader_options::builder(cudf::io::source_info(datasource.get())).lines(true); - - // Read the JSON file from the LocalFileSystem - cudf::io::table_with_metadata tbl = cudf::io::read_json(options); - - ASSERT_EQ(2, tbl.tbl->num_columns()); - ASSERT_EQ(2, tbl.tbl->num_rows()); + std::string line; + std::ifstream myfile(file_name); + if (myfile.is_open()) { + while (getline(myfile, line)) { std::cout << line << '\n'; } + myfile.close(); + } else + std::cout << "Unable to open file"; + + // std::string file_uri = "file://" + file_name; + // std::unique_ptr datasource = + // std::make_unique(file_uri); + + // // Populate the JSON Reader Options + // cudf::io::json_reader_options options = + // cudf::io::json_reader_options::builder(cudf::io::source_info(datasource.get())).lines(true); + + // // Read the JSON file from the LocalFileSystem + // cudf::io::table_with_metadata tbl = cudf::io::read_json(options); + + // ASSERT_EQ(2, tbl.tbl->num_columns()); + // ASSERT_EQ(2, tbl.tbl->num_rows()); } #ifdef S3_ENABLED From 47e3abf2cebf4118679bd3f64b0bb80488fd3c32 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 17 Jun 2021 20:03:35 -0700 Subject: [PATCH 27/54] disable tests --- cpp/tests/io/arrow_io_source_test.cpp | 36 +++++++++++++-------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/cpp/tests/io/arrow_io_source_test.cpp b/cpp/tests/io/arrow_io_source_test.cpp index 3fdb3988194..b3b26062dce 100644 --- a/cpp/tests/io/arrow_io_source_test.cpp +++ b/cpp/tests/io/arrow_io_source_test.cpp @@ -73,30 +73,30 @@ TEST_F(ArrowIOTest, URIFileSystem) TEST_F(ArrowIOTest, S3FileSystem) { - std::string s3_uri = "s3://rapidsai-data/cudf/test/tips.parquet?region=us-east-2"; - std::unique_ptr datasource = - std::make_unique(s3_uri); - - // Populate the Parquet Reader Options - cudf::io::source_info src(datasource.get()); - std::vector single_column; - single_column.insert(single_column.begin(), "total_bill"); - cudf::io::parquet_reader_options_builder builder(src); - cudf::io::parquet_reader_options options = builder.columns(single_column).build(); - - // Read the Parquet file from S3 - cudf::io::table_with_metadata tbl = cudf::io::read_parquet(options); - - ASSERT_EQ(1, tbl.tbl->num_columns()); // Only single column specified in reader_options - ASSERT_EQ(244, tbl.tbl->num_rows()); // known number of rows from the S3 file + // std::string s3_uri = "s3://rapidsai-data/cudf/test/tips.parquet?region=us-east-2"; + // std::unique_ptr datasource = + // std::make_unique(s3_uri); + + // // Populate the Parquet Reader Options + // cudf::io::source_info src(datasource.get()); + // std::vector single_column; + // single_column.insert(single_column.begin(), "total_bill"); + // cudf::io::parquet_reader_options_builder builder(src); + // cudf::io::parquet_reader_options options = builder.columns(single_column).build(); + + // // Read the Parquet file from S3 + // cudf::io::table_with_metadata tbl = cudf::io::read_parquet(options); + + // ASSERT_EQ(1, tbl.tbl->num_columns()); // Only single column specified in reader_options + // ASSERT_EQ(244, tbl.tbl->num_rows()); // known number of rows from the S3 file } #else TEST_F(ArrowIOTest, S3URIWhenNotEnabled) { - std::string s3_uri = "s3://rapidsai-data/cudf/test/tips.parquet?region=us-east-2"; - EXPECT_THROW(std::make_unique(s3_uri), cudf::logic_error); + // std::string s3_uri = "s3://rapidsai-data/cudf/test/tips.parquet?region=us-east-2"; + // EXPECT_THROW(std::make_unique(s3_uri), cudf::logic_error); } #endif From 4775068627b39d7aa274712a466be9c67430e77b Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 18 Jun 2021 06:38:19 -0700 Subject: [PATCH 28/54] disable arrow test --- cpp/tests/io/csv_test.cpp | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index f2278267f74..a2835e576c4 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -1060,25 +1060,25 @@ TEST_F(CsvReaderTest, HeaderOnlyFile) TEST_F(CsvReaderTest, ArrowFileSource) { - auto filepath = temp_env->get_temp_dir() + "ArrowFileSource.csv"; - { - std::ofstream outfile(filepath, std::ofstream::out); - outfile << "A\n9\n8\n7\n6\n5\n4\n3\n2\n"; - } + // auto filepath = temp_env->get_temp_dir() + "ArrowFileSource.csv"; + // { + // std::ofstream outfile(filepath, std::ofstream::out); + // outfile << "A\n9\n8\n7\n6\n5\n4\n3\n2\n"; + // } - std::shared_ptr infile; - ASSERT_TRUE(arrow::io::ReadableFile::Open(filepath).Value(&infile).ok()); + // std::shared_ptr infile; + // ASSERT_TRUE(arrow::io::ReadableFile::Open(filepath).Value(&infile).ok()); - auto arrow_source = cudf_io::arrow_io_source{infile}; - cudf_io::csv_reader_options in_opts = - cudf_io::csv_reader_options::builder(cudf_io::source_info{&arrow_source}).dtypes({"int8"}); - auto result = cudf_io::read_csv(in_opts); + // auto arrow_source = cudf_io::arrow_io_source{infile}; + // cudf_io::csv_reader_options in_opts = + // cudf_io::csv_reader_options::builder(cudf_io::source_info{&arrow_source}).dtypes({"int8"}); + // auto result = cudf_io::read_csv(in_opts); - const auto view = result.tbl->view(); - EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT8, view.column(0).type().id()); + // const auto view = result.tbl->view(); + // EXPECT_EQ(1, view.num_columns()); + // ASSERT_EQ(cudf::type_id::INT8, view.column(0).type().id()); - expect_column_data_equal(std::vector{9, 8, 7, 6, 5, 4, 3, 2}, view.column(0)); + // expect_column_data_equal(std::vector{9, 8, 7, 6, 5, 4, 3, 2}, view.column(0)); } TEST_F(CsvReaderTest, InvalidFloatingPoint) From 626b9159bda7d18bf69a79cd6f3a30a654ad4056 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 22 Jun 2021 12:41:30 -0700 Subject: [PATCH 29/54] add arrow 4.0.1 in cpu builds --- ci/cpu/build.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index e11a0488624..c721c7f2a7f 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -41,6 +41,9 @@ env gpuci_logger "Activate conda env" . /opt/conda/etc/profile.d/conda.sh conda activate rapids +gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env +# gpuci_conda_retry install -y "your-pkg=1.0.0" +gpuci_conda_retry install -y "arrow-cpp=4.0.1 pyarrow=4.0.1" # Remove rapidsai-nightly channel if we are building main branch if [ "$SOURCE_BRANCH" = "main" ]; then From a7b631e105f28d79f4b49438bde435b5bf622000 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 22 Jun 2021 13:09:03 -0700 Subject: [PATCH 30/54] conda install --- ci/cpu/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index c721c7f2a7f..1a335e0797b 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -41,7 +41,7 @@ env gpuci_logger "Activate conda env" . /opt/conda/etc/profile.d/conda.sh conda activate rapids -gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env +gpuci_conda_retry remove --force arrow-cpp pyarrow "arrow-cpp-proc * cuda" # gpuci_conda_retry install -y "your-pkg=1.0.0" gpuci_conda_retry install -y "arrow-cpp=4.0.1 pyarrow=4.0.1" From 951df947aef6971e28d8e07122b8b36e22d110c9 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 22 Jun 2021 13:22:22 -0700 Subject: [PATCH 31/54] gpu packages --- ci/cpu/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index 1a335e0797b..98782ef39f7 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -43,7 +43,7 @@ gpuci_logger "Activate conda env" conda activate rapids gpuci_conda_retry remove --force arrow-cpp pyarrow "arrow-cpp-proc * cuda" # gpuci_conda_retry install -y "your-pkg=1.0.0" -gpuci_conda_retry install -y "arrow-cpp=4.0.1 pyarrow=4.0.1" +gpuci_conda_retry install -y "arrow-cpp=4.0.1 pyarrow=4.0.1 arrow-cpp-proc * cuda" # Remove rapidsai-nightly channel if we are building main branch if [ "$SOURCE_BRANCH" = "main" ]; then From 0c3570faef553bebfb1dc6f0c3b2b320b2a7c33b Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 22 Jun 2021 14:10:17 -0700 Subject: [PATCH 32/54] -y --- ci/cpu/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index 98782ef39f7..d9caf04a92e 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -41,7 +41,7 @@ env gpuci_logger "Activate conda env" . /opt/conda/etc/profile.d/conda.sh conda activate rapids -gpuci_conda_retry remove --force arrow-cpp pyarrow "arrow-cpp-proc * cuda" +gpuci_conda_retry remove -y --force arrow-cpp pyarrow "arrow-cpp-proc * cuda" # gpuci_conda_retry install -y "your-pkg=1.0.0" gpuci_conda_retry install -y "arrow-cpp=4.0.1 pyarrow=4.0.1 arrow-cpp-proc * cuda" From 6eecdc713510c104af25b0f81667a8075206bac4 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 22 Jun 2021 14:19:31 -0700 Subject: [PATCH 33/54] use conda --- ci/cpu/build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index d9caf04a92e..6c29e2c9381 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -41,9 +41,9 @@ env gpuci_logger "Activate conda env" . /opt/conda/etc/profile.d/conda.sh conda activate rapids -gpuci_conda_retry remove -y --force arrow-cpp pyarrow "arrow-cpp-proc * cuda" +conda remove -y --force arrow-cpp pyarrow "arrow-cpp-proc * cuda" # gpuci_conda_retry install -y "your-pkg=1.0.0" -gpuci_conda_retry install -y "arrow-cpp=4.0.1 pyarrow=4.0.1 arrow-cpp-proc * cuda" +conda install -y "arrow-cpp=4.0.1 pyarrow=4.0.1 arrow-cpp-proc * cuda" # Remove rapidsai-nightly channel if we are building main branch if [ "$SOURCE_BRANCH" = "main" ]; then From 038349f43a0254331686bb5d115e1996edb1cae0 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 22 Jun 2021 14:24:39 -0700 Subject: [PATCH 34/54] fix --- ci/cpu/build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index 6c29e2c9381..1dc18078fe5 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -41,9 +41,9 @@ env gpuci_logger "Activate conda env" . /opt/conda/etc/profile.d/conda.sh conda activate rapids -conda remove -y --force arrow-cpp pyarrow "arrow-cpp-proc * cuda" +gpuci_conda_retry remove -y --force arrow-cpp pyarrow 'arrow-cpp-proc=*=cuda' # gpuci_conda_retry install -y "your-pkg=1.0.0" -conda install -y "arrow-cpp=4.0.1 pyarrow=4.0.1 arrow-cpp-proc * cuda" +gpuci_conda_retry install -y "arrow-cpp=4.0.1 pyarrow=4.0.1 arrow-cpp-proc * cuda" # Remove rapidsai-nightly channel if we are building main branch if [ "$SOURCE_BRANCH" = "main" ]; then From 9f06543d1dcf52c24cc961cf6b7cc83ee3769039 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 22 Jun 2021 14:25:37 -0700 Subject: [PATCH 35/54] fix --- ci/cpu/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index 1dc18078fe5..c94588b91c7 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -43,7 +43,7 @@ gpuci_logger "Activate conda env" conda activate rapids gpuci_conda_retry remove -y --force arrow-cpp pyarrow 'arrow-cpp-proc=*=cuda' # gpuci_conda_retry install -y "your-pkg=1.0.0" -gpuci_conda_retry install -y "arrow-cpp=4.0.1 pyarrow=4.0.1 arrow-cpp-proc * cuda" +gpuci_conda_retry install -y "arrow-cpp=4.0.1 pyarrow=4.0.1" 'arrow-cpp-proc=*=cuda' # Remove rapidsai-nightly channel if we are building main branch if [ "$SOURCE_BRANCH" = "main" ]; then From 18dac174dbfe0c606b72c140b9849ca53b686518 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 22 Jun 2021 16:28:34 -0500 Subject: [PATCH 36/54] Apply suggestions from code review Co-authored-by: jakirkham --- ci/cpu/build.sh | 2 +- ci/gpu/build.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index c94588b91c7..2c6cecf667b 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -43,7 +43,7 @@ gpuci_logger "Activate conda env" conda activate rapids gpuci_conda_retry remove -y --force arrow-cpp pyarrow 'arrow-cpp-proc=*=cuda' # gpuci_conda_retry install -y "your-pkg=1.0.0" -gpuci_conda_retry install -y "arrow-cpp=4.0.1 pyarrow=4.0.1" 'arrow-cpp-proc=*=cuda' +gpuci_conda_retry install -y "arrow-cpp=4.0.1" "pyarrow=4.0.1" 'arrow-cpp-proc=*=cuda' # Remove rapidsai-nightly channel if we are building main branch if [ "$SOURCE_BRANCH" = "main" ]; then diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 68e239544f3..22cf6e67827 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -85,7 +85,7 @@ gpuci_conda_retry install -y \ # https://docs.rapids.ai/maintainers/depmgmt/ gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env # gpuci_conda_retry install -y "your-pkg=1.0.0" -gpuci_conda_retry install -y "arrow-cpp=4.0.1 pyarrow=4.0.1" +gpuci_conda_retry install -y "arrow-cpp=4.0.1" "pyarrow=4.0.1" 'arrow-cpp-proc=*=cuda' gpuci_logger "Check compiler versions" From 2cf75fbb0648e711c6e977bb7c5703fa10261d86 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 22 Jun 2021 16:32:38 -0700 Subject: [PATCH 37/54] revert disabling of arrow tests --- cpp/tests/io/arrow_io_source_test.cpp | 74 ++++++++++++--------------- cpp/tests/io/csv_test.cpp | 30 +++++------ 2 files changed, 48 insertions(+), 56 deletions(-) diff --git a/cpp/tests/io/arrow_io_source_test.cpp b/cpp/tests/io/arrow_io_source_test.cpp index b3b26062dce..24964db5f8c 100644 --- a/cpp/tests/io/arrow_io_source_test.cpp +++ b/cpp/tests/io/arrow_io_source_test.cpp @@ -43,60 +43,52 @@ TEST_F(ArrowIOTest, URIFileSystem) { const std::string file_name = temp_env->get_temp_dir() + "JsonLinesFileTest.json"; std::ofstream outfile(file_name, std::ofstream::out); - outfile << "[11, 1.1]" << std::endl; - outfile << "[22, 2.2]" << std::endl; + outfile << "[11, 1.1]\n[22, 2.2]"; outfile.close(); - std::string line; - std::ifstream myfile(file_name); - if (myfile.is_open()) { - while (getline(myfile, line)) { std::cout << line << '\n'; } - myfile.close(); - } else - std::cout << "Unable to open file"; - - // std::string file_uri = "file://" + file_name; - // std::unique_ptr datasource = - // std::make_unique(file_uri); - - // // Populate the JSON Reader Options - // cudf::io::json_reader_options options = - // cudf::io::json_reader_options::builder(cudf::io::source_info(datasource.get())).lines(true); - - // // Read the JSON file from the LocalFileSystem - // cudf::io::table_with_metadata tbl = cudf::io::read_json(options); - - // ASSERT_EQ(2, tbl.tbl->num_columns()); - // ASSERT_EQ(2, tbl.tbl->num_rows()); + + std::string file_uri = "file://" + file_name; + std::unique_ptr datasource = + std::make_unique(file_uri); + + // Populate the JSON Reader Options + cudf::io::json_reader_options options = + cudf::io::json_reader_options::builder(cudf::io::source_info(datasource.get())).lines(true); + + // Read the JSON file from the LocalFileSystem + cudf::io::table_with_metadata tbl = cudf::io::read_json(options); + + ASSERT_EQ(2, tbl.tbl->num_columns()); + ASSERT_EQ(2, tbl.tbl->num_rows()); } #ifdef S3_ENABLED TEST_F(ArrowIOTest, S3FileSystem) { - // std::string s3_uri = "s3://rapidsai-data/cudf/test/tips.parquet?region=us-east-2"; - // std::unique_ptr datasource = - // std::make_unique(s3_uri); - - // // Populate the Parquet Reader Options - // cudf::io::source_info src(datasource.get()); - // std::vector single_column; - // single_column.insert(single_column.begin(), "total_bill"); - // cudf::io::parquet_reader_options_builder builder(src); - // cudf::io::parquet_reader_options options = builder.columns(single_column).build(); - - // // Read the Parquet file from S3 - // cudf::io::table_with_metadata tbl = cudf::io::read_parquet(options); - - // ASSERT_EQ(1, tbl.tbl->num_columns()); // Only single column specified in reader_options - // ASSERT_EQ(244, tbl.tbl->num_rows()); // known number of rows from the S3 file + std::string s3_uri = "s3://rapidsai-data/cudf/test/tips.parquet?region=us-east-2"; + std::unique_ptr datasource = + std::make_unique(s3_uri); + + // Populate the Parquet Reader Options + cudf::io::source_info src(datasource.get()); + std::vector single_column; + single_column.insert(single_column.begin(), "total_bill"); + cudf::io::parquet_reader_options_builder builder(src); + cudf::io::parquet_reader_options options = builder.columns(single_column).build(); + + // Read the Parquet file from S3 + cudf::io::table_with_metadata tbl = cudf::io::read_parquet(options); + + ASSERT_EQ(1, tbl.tbl->num_columns()); // Only single column specified in reader_options + ASSERT_EQ(244, tbl.tbl->num_rows()); // known number of rows from the S3 file } #else TEST_F(ArrowIOTest, S3URIWhenNotEnabled) { - // std::string s3_uri = "s3://rapidsai-data/cudf/test/tips.parquet?region=us-east-2"; - // EXPECT_THROW(std::make_unique(s3_uri), cudf::logic_error); + std::string s3_uri = "s3://rapidsai-data/cudf/test/tips.parquet?region=us-east-2"; + EXPECT_THROW(std::make_unique(s3_uri), cudf::logic_error); } #endif diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index a2835e576c4..f2278267f74 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -1060,25 +1060,25 @@ TEST_F(CsvReaderTest, HeaderOnlyFile) TEST_F(CsvReaderTest, ArrowFileSource) { - // auto filepath = temp_env->get_temp_dir() + "ArrowFileSource.csv"; - // { - // std::ofstream outfile(filepath, std::ofstream::out); - // outfile << "A\n9\n8\n7\n6\n5\n4\n3\n2\n"; - // } + auto filepath = temp_env->get_temp_dir() + "ArrowFileSource.csv"; + { + std::ofstream outfile(filepath, std::ofstream::out); + outfile << "A\n9\n8\n7\n6\n5\n4\n3\n2\n"; + } - // std::shared_ptr infile; - // ASSERT_TRUE(arrow::io::ReadableFile::Open(filepath).Value(&infile).ok()); + std::shared_ptr infile; + ASSERT_TRUE(arrow::io::ReadableFile::Open(filepath).Value(&infile).ok()); - // auto arrow_source = cudf_io::arrow_io_source{infile}; - // cudf_io::csv_reader_options in_opts = - // cudf_io::csv_reader_options::builder(cudf_io::source_info{&arrow_source}).dtypes({"int8"}); - // auto result = cudf_io::read_csv(in_opts); + auto arrow_source = cudf_io::arrow_io_source{infile}; + cudf_io::csv_reader_options in_opts = + cudf_io::csv_reader_options::builder(cudf_io::source_info{&arrow_source}).dtypes({"int8"}); + auto result = cudf_io::read_csv(in_opts); - // const auto view = result.tbl->view(); - // EXPECT_EQ(1, view.num_columns()); - // ASSERT_EQ(cudf::type_id::INT8, view.column(0).type().id()); + const auto view = result.tbl->view(); + EXPECT_EQ(1, view.num_columns()); + ASSERT_EQ(cudf::type_id::INT8, view.column(0).type().id()); - // expect_column_data_equal(std::vector{9, 8, 7, 6, 5, 4, 3, 2}, view.column(0)); + expect_column_data_equal(std::vector{9, 8, 7, 6, 5, 4, 3, 2}, view.column(0)); } TEST_F(CsvReaderTest, InvalidFloatingPoint) From d07ccfa9f4579e9eefc306e89e7fe8b56866ea2c Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 23 Jun 2021 08:14:43 -0700 Subject: [PATCH 38/54] test --- conda/recipes/cudf_kafka/meta.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index 3506d118f07..635fb0f00af 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -29,6 +29,9 @@ requirements: - python - cython >=0.29,<0.30 - setuptools + - pyarrow=4.0.1 + - arrow-cpp=4.0.1 + - arrow-cpp-proc * cuda - cudf {{ version }} - libcudf_kafka {{ version }} run: From bec13ac2245b04de761c59ab769f36272568a17c Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 23 Jun 2021 14:55:52 -0700 Subject: [PATCH 39/54] test --- conda/recipes/cudf_kafka/meta.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index 635fb0f00af..3506d118f07 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -29,9 +29,6 @@ requirements: - python - cython >=0.29,<0.30 - setuptools - - pyarrow=4.0.1 - - arrow-cpp=4.0.1 - - arrow-cpp-proc * cuda - cudf {{ version }} - libcudf_kafka {{ version }} run: From 726da201312fdd66e0a28e286ece1e9c09f19edf Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 23 Jun 2021 16:45:06 -0700 Subject: [PATCH 40/54] remove force uninstall of arrow --- ci/cpu/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index 2c6cecf667b..91f7766cffb 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -41,7 +41,7 @@ env gpuci_logger "Activate conda env" . /opt/conda/etc/profile.d/conda.sh conda activate rapids -gpuci_conda_retry remove -y --force arrow-cpp pyarrow 'arrow-cpp-proc=*=cuda' +# gpuci_conda_retry remove -y --force arrow-cpp pyarrow 'arrow-cpp-proc=*=cuda' # gpuci_conda_retry install -y "your-pkg=1.0.0" gpuci_conda_retry install -y "arrow-cpp=4.0.1" "pyarrow=4.0.1" 'arrow-cpp-proc=*=cuda' From 944d32278589164cfc1a873a3ae29146e4664b80 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 23 Jun 2021 18:58:35 -0700 Subject: [PATCH 41/54] change to conda --- ci/cpu/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index 91f7766cffb..72de545147a 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -43,7 +43,7 @@ gpuci_logger "Activate conda env" conda activate rapids # gpuci_conda_retry remove -y --force arrow-cpp pyarrow 'arrow-cpp-proc=*=cuda' # gpuci_conda_retry install -y "your-pkg=1.0.0" -gpuci_conda_retry install -y "arrow-cpp=4.0.1" "pyarrow=4.0.1" 'arrow-cpp-proc=*=cuda' +conda install -y "arrow-cpp=4.0.1" "pyarrow=4.0.1" 'arrow-cpp-proc=*=cuda' # Remove rapidsai-nightly channel if we are building main branch if [ "$SOURCE_BRANCH" = "main" ]; then From d37af801f44fb35aef1c9d921ddc872a9feabcc6 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 23 Jun 2021 19:05:39 -0700 Subject: [PATCH 42/54] version --- conda/recipes/libcudf_kafka/meta.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml index f1ec813a17f..9d4ca561a45 100644 --- a/conda/recipes/libcudf_kafka/meta.yaml +++ b/conda/recipes/libcudf_kafka/meta.yaml @@ -2,6 +2,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} +{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} package: name: libcudf_kafka @@ -25,7 +26,7 @@ requirements: build: - cmake >=3.20.1 host: - - libcudf {{ version }} + - libcudf-{{version}}-cuda{{ cuda_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - librdkafka >=1.5.0,<1.5.3 run: - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not From 0cb9ed4bf3376647c5b4cd027ec992faf83f96b9 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 23 Jun 2021 19:22:14 -0700 Subject: [PATCH 43/54] add build number --- conda/recipes/libcudf_kafka/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml index 9d4ca561a45..b813aacec85 100644 --- a/conda/recipes/libcudf_kafka/meta.yaml +++ b/conda/recipes/libcudf_kafka/meta.yaml @@ -26,7 +26,7 @@ requirements: build: - cmake >=3.20.1 host: - - libcudf-{{version}}-cuda{{ cuda_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + - libcudf {{version}} {{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - librdkafka >=1.5.0,<1.5.3 run: - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not From 9faa4f402fa24b8abc4efc3f96001d6846cbafa7 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 23 Jun 2021 19:59:31 -0700 Subject: [PATCH 44/54] conda --- conda/recipes/libcudf_kafka/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml index b813aacec85..065d388902d 100644 --- a/conda/recipes/libcudf_kafka/meta.yaml +++ b/conda/recipes/libcudf_kafka/meta.yaml @@ -26,7 +26,7 @@ requirements: build: - cmake >=3.20.1 host: - - libcudf {{version}} {{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + - libcudf {{version}}-cuda{{ cuda_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - librdkafka >=1.5.0,<1.5.3 run: - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not From 51ef74f5a481334e2e91f1ac5878c3c898ff2eec Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 23 Jun 2021 20:37:19 -0700 Subject: [PATCH 45/54] test --- conda/recipes/libcudf_kafka/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml index 065d388902d..dae7b781cdc 100644 --- a/conda/recipes/libcudf_kafka/meta.yaml +++ b/conda/recipes/libcudf_kafka/meta.yaml @@ -26,7 +26,7 @@ requirements: build: - cmake >=3.20.1 host: - - libcudf {{version}}-cuda{{ cuda_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + - libcudf {{version}} cuda{{ cuda_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - librdkafka >=1.5.0,<1.5.3 run: - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not From 39f3f2e6dea7d5678db75ff285a77297aa8d0261 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 23 Jun 2021 21:17:01 -0700 Subject: [PATCH 46/54] unpin librdkafka --- conda/recipes/libcudf_kafka/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml index dae7b781cdc..7cf72958957 100644 --- a/conda/recipes/libcudf_kafka/meta.yaml +++ b/conda/recipes/libcudf_kafka/meta.yaml @@ -27,7 +27,7 @@ requirements: - cmake >=3.20.1 host: - libcudf {{version}} cuda{{ cuda_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - - librdkafka >=1.5.0,<1.5.3 + - librdkafka run: - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not From b8d33c8c46839736fca354527a4d2333e85ebd24 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 24 Jun 2021 09:17:35 -0700 Subject: [PATCH 47/54] bump librdkafka --- ci/gpu/build.sh | 4 ++-- conda/recipes/libcudf_kafka/meta.yaml | 2 +- python/cudf/cudf/tests/test_cuda_array_interface.py | 3 +++ 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 22cf6e67827..333c241f615 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -85,7 +85,7 @@ gpuci_conda_retry install -y \ # https://docs.rapids.ai/maintainers/depmgmt/ gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env # gpuci_conda_retry install -y "your-pkg=1.0.0" -gpuci_conda_retry install -y "arrow-cpp=4.0.1" "pyarrow=4.0.1" 'arrow-cpp-proc=*=cuda' +gpuci_conda_retry install -y "arrow-cpp=4.0.1" "pyarrow=4.0.1" 'arrow-cpp-proc=*=cuda' "pyorc" gpuci_logger "Check compiler versions" @@ -218,7 +218,7 @@ fi cd "$WORKSPACE/python/cudf" gpuci_logger "Python py.test for cuDF" -py.test -n 6 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term +py.test -n 6 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term cd "$WORKSPACE/python/dask_cudf" gpuci_logger "Python py.test for dask-cudf" diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml index 7cf72958957..a8ab6811f5a 100644 --- a/conda/recipes/libcudf_kafka/meta.yaml +++ b/conda/recipes/libcudf_kafka/meta.yaml @@ -27,7 +27,7 @@ requirements: - cmake >=3.20.1 host: - libcudf {{version}} cuda{{ cuda_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - - librdkafka + - librdkafka >= 1.6.1,<1.7.0a0 run: - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py index 041dc0076f8..ecf961f133b 100644 --- a/python/cudf/cudf/tests/test_cuda_array_interface.py +++ b/python/cudf/cudf/tests/test_cuda_array_interface.py @@ -171,6 +171,9 @@ def test_column_from_ephemeral_cupy_try_lose_reference(): def test_cuda_array_interface_pytorch(): torch = pytest.importorskip("torch") + if not torch.cuda.is_available(): + pytest.skip("need gpu version of pytorch to be installed") + series = cudf.Series([1, -1, 10, -56]) tensor = torch.tensor(series) got = cudf.Series(tensor) From 2d188eee56971bcf72513e65f5acb9af08b2db3d Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 24 Jun 2021 10:37:11 -0700 Subject: [PATCH 48/54] refactor --- python/cudf/cudf/testing/_utils.py | 33 ---- python/cudf/cudf/tests/test_binops.py | 16 +- python/cudf/cudf/tests/test_joining.py | 222 ++++++++++++++++++++++++- python/cudf/cudf/tests/test_string.py | 191 --------------------- 4 files changed, 226 insertions(+), 236 deletions(-) diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index 8d2679dd7aa..672e83e6f64 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -16,8 +16,6 @@ from cudf.core.column.datetime import _numpy_to_pandas_conversion from cudf.utils import dtypes as dtypeutils -_JOIN_TYPES = ("left", "inner", "outer", "right", "leftanti", "leftsemi") - supported_numpy_dtypes = [ "bool", "int8", @@ -310,36 +308,5 @@ def does_not_raise(): yield -def assert_join_results_equal(expect, got, how, **kwargs): - if how not in _JOIN_TYPES: - raise ValueError(f"Unrecognized join type {how}") - if how == "right": - got = got[expect.columns] - - if isinstance(expect, (pd.Series, cudf.Series)): - return assert_eq( - expect.sort_values().reset_index(drop=True), - got.sort_values().reset_index(drop=True), - **kwargs, - ) - elif isinstance(expect, (pd.DataFrame, cudf.DataFrame)): - if not len( - expect.columns - ): # can't sort_values() on a df without columns - return assert_eq(expect, got, **kwargs) - - assert_eq( - expect.sort_values(expect.columns.to_list()).reset_index( - drop=True - ), - got.sort_values(got.columns.to_list()).reset_index(drop=True), - **kwargs, - ) - elif isinstance(expect, (pd.Index, cudf.Index)): - return assert_eq(expect.sort_values(), got.sort_values(), **kwargs) - else: - raise ValueError(f"Not a join result: {type(expect).__name__}") - - def xfail_param(param, **kwargs): return pytest.param(param, marks=pytest.mark.xfail(**kwargs)) diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index cfd2ea5143f..1c97cbb10ff 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -1742,12 +1742,6 @@ def test_binops_with_NA_consistent(dtype, op): assert result._column.null_count == len(data) -def _decimal_series(input, dtype): - return cudf.Series( - [x if x is None else decimal.Decimal(x) for x in input], dtype=dtype, - ) - - @pytest.mark.parametrize( "args", [ @@ -2080,10 +2074,10 @@ def _decimal_series(input, dtype): def test_binops_decimal(args): op, lhs, l_dtype, rhs, r_dtype, expect, expect_dtype = args - a = _decimal_series(lhs, l_dtype) - b = _decimal_series(rhs, r_dtype) + a = utils._decimal_series(lhs, l_dtype) + b = utils._decimal_series(rhs, r_dtype) expect = ( - _decimal_series(expect, expect_dtype) + utils._decimal_series(expect, expect_dtype) if isinstance(expect_dtype, cudf.Decimal64Dtype) else cudf.Series(expect, dtype=expect_dtype) ) @@ -2258,7 +2252,7 @@ def test_binops_decimal_comp_mixed_integer(args, integer_dtype, reflected): else: op, ldata, ldtype, rdata, _, expected = args - lhs = _decimal_series(ldata, ldtype) + lhs = utils._decimal_series(ldata, ldtype) rhs = cudf.Series(rdata, dtype=integer_dtype) if reflected: @@ -2746,7 +2740,7 @@ def test_binops_decimal_scalar_compare(args, reflected): else: op, ldata, ldtype, rdata, _, expected = args - lhs = _decimal_series(ldata, ldtype) + lhs = utils._decimal_series(ldata, ldtype) rhs = rdata if reflected: diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 73b71bee77d..7b56f864272 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -12,7 +12,6 @@ NUMERIC_TYPES, assert_eq, assert_exceptions_equal, - assert_join_results_equal, ) _JOIN_TYPES = ("left", "inner", "outer", "right", "leftanti", "leftsemi") @@ -72,6 +71,37 @@ def pd_odd_joins(left, right, join_type): return left[left.index.isin(right.index)][left.columns] +def assert_join_results_equal(expect, got, how, **kwargs): + if how not in _JOIN_TYPES: + raise ValueError(f"Unrecognized join type {how}") + if how == "right": + got = got[expect.columns] + + if isinstance(expect, (pd.Series, cudf.Series)): + return assert_eq( + expect.sort_values().reset_index(drop=True), + got.sort_values().reset_index(drop=True), + **kwargs, + ) + elif isinstance(expect, (pd.DataFrame, cudf.DataFrame)): + if not len( + expect.columns + ): # can't sort_values() on a df without columns + return assert_eq(expect, got, **kwargs) + + assert_eq( + expect.sort_values(expect.columns.to_list()).reset_index( + drop=True + ), + got.sort_values(got.columns.to_list()).reset_index(drop=True), + **kwargs, + ) + elif isinstance(expect, (pd.Index, cudf.Index)): + return assert_eq(expect.sort_values(), got.sort_values(), **kwargs) + else: + raise ValueError(f"Not a join result: {type(expect).__name__}") + + @pytest.mark.parametrize("aa,bb,how,method", make_params()) def test_dataframe_join_how(aa, bb, how, method): df = cudf.DataFrame() @@ -1892,3 +1922,193 @@ def test_join_merge_invalid_keys(on, how): with pytest.raises(KeyError): pd_left.merge(pd_right, on=on) gd_left.merge(gd_right, on=on) + + +@pytest.mark.parametrize( + "str_data", + [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]], +) +@pytest.mark.parametrize("num_keys", [1, 2, 3]) +@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) +def test_string_join_key(str_data, num_keys, how): + other_data = [1, 2, 3, 4, 5][: len(str_data)] + + pdf = pd.DataFrame() + gdf = cudf.DataFrame() + for i in range(num_keys): + pdf[i] = pd.Series(str_data, dtype="str") + gdf[i] = cudf.Series(str_data, dtype="str") + pdf["a"] = other_data + gdf["a"] = other_data + + pdf2 = pdf.copy() + gdf2 = gdf.copy() + + expect = pdf.merge(pdf2, on=list(range(num_keys)), how=how) + got = gdf.merge(gdf2, on=list(range(num_keys)), how=how) + + if len(expect) == 0 and len(got) == 0: + expect = expect.reset_index(drop=True) + got = got[expect.columns] # reorder columns + + if how == "right": + got = got[expect.columns] # reorder columns + + assert_join_results_equal(expect, got, how=how) + + +@pytest.mark.parametrize( + "str_data_nulls", + [ + ["a", "b", "c"], + ["a", "b", "f", "g"], + ["f", "g", "h", "i", "j"], + ["f", "g", "h"], + [None, None, None, None, None], + [], + ], +) +def test_string_join_key_nulls(str_data_nulls): + str_data = ["a", "b", "c", "d", "e"] + other_data = [1, 2, 3, 4, 5] + + other_data_nulls = [6, 7, 8, 9, 10][: len(str_data_nulls)] + + pdf = pd.DataFrame() + gdf = cudf.DataFrame() + pdf["key"] = pd.Series(str_data, dtype="str") + gdf["key"] = cudf.Series(str_data, dtype="str") + pdf["vals"] = other_data + gdf["vals"] = other_data + + pdf2 = pd.DataFrame() + gdf2 = cudf.DataFrame() + pdf2["key"] = pd.Series(str_data_nulls, dtype="str") + gdf2["key"] = cudf.Series(str_data_nulls, dtype="str") + pdf2["vals"] = pd.Series(other_data_nulls, dtype="int64") + gdf2["vals"] = cudf.Series(other_data_nulls, dtype="int64") + + expect = pdf.merge(pdf2, on="key", how="left") + got = gdf.merge(gdf2, on="key", how="left") + got["vals_y"] = got["vals_y"].fillna(-1) + + if len(expect) == 0 and len(got) == 0: + expect = expect.reset_index(drop=True) + got = got[expect.columns] + + expect["vals_y"] = expect["vals_y"].fillna(-1).astype("int64") + + assert_join_results_equal(expect, got, how="left") + + +@pytest.mark.parametrize( + "str_data", [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]] +) +@pytest.mark.parametrize("num_cols", [1, 2, 3]) +@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) +def test_string_join_non_key(str_data, num_cols, how): + other_data = [1, 2, 3, 4, 5][: len(str_data)] + + pdf = pd.DataFrame() + gdf = cudf.DataFrame() + for i in range(num_cols): + pdf[i] = pd.Series(str_data, dtype="str") + gdf[i] = cudf.Series(str_data, dtype="str") + pdf["a"] = other_data + gdf["a"] = other_data + + pdf2 = pdf.copy() + gdf2 = gdf.copy() + + expect = pdf.merge(pdf2, on=["a"], how=how) + got = gdf.merge(gdf2, on=["a"], how=how) + + if len(expect) == 0 and len(got) == 0: + expect = expect.reset_index(drop=True) + got = got[expect.columns] + + if how == "right": + got = got[expect.columns] # reorder columns + + assert_join_results_equal(expect, got, how=how) + + +@pytest.mark.parametrize( + "str_data_nulls", + [ + ["a", "b", "c"], + ["a", "b", "f", "g"], + ["f", "g", "h", "i", "j"], + ["f", "g", "h"], + [None, None, None, None, None], + [], + ], +) +def test_string_join_non_key_nulls(str_data_nulls): + str_data = ["a", "b", "c", "d", "e"] + other_data = [1, 2, 3, 4, 5] + + other_data_nulls = [6, 7, 8, 9, 10][: len(str_data_nulls)] + + pdf = pd.DataFrame() + gdf = cudf.DataFrame() + pdf["vals"] = pd.Series(str_data, dtype="str") + gdf["vals"] = cudf.Series(str_data, dtype="str") + pdf["key"] = other_data + gdf["key"] = other_data + + pdf2 = pd.DataFrame() + gdf2 = cudf.DataFrame() + pdf2["vals"] = pd.Series(str_data_nulls, dtype="str") + gdf2["vals"] = cudf.Series(str_data_nulls, dtype="str") + pdf2["key"] = pd.Series(other_data_nulls, dtype="int64") + gdf2["key"] = cudf.Series(other_data_nulls, dtype="int64") + + expect = pdf.merge(pdf2, on="key", how="left") + got = gdf.merge(gdf2, on="key", how="left") + + if len(expect) == 0 and len(got) == 0: + expect = expect.reset_index(drop=True) + got = got[expect.columns] + + assert_join_results_equal(expect, got, how="left") + + +def test_string_join_values_nulls(): + left_dict = [ + {"b": "MATCH 1", "a": 1.0}, + {"b": "MATCH 1", "a": 1.0}, + {"b": "LEFT NO MATCH 1", "a": -1.0}, + {"b": "MATCH 2", "a": 2.0}, + {"b": "MATCH 2", "a": 2.0}, + {"b": "MATCH 1", "a": 1.0}, + {"b": "MATCH 1", "a": 1.0}, + {"b": "MATCH 2", "a": 2.0}, + {"b": "MATCH 2", "a": 2.0}, + {"b": "LEFT NO MATCH 2", "a": -2.0}, + {"b": "MATCH 3", "a": 3.0}, + {"b": "MATCH 3", "a": 3.0}, + ] + + right_dict = [ + {"b": "RIGHT NO MATCH 1", "c": -1.0}, + {"b": "MATCH 3", "c": 3.0}, + {"b": "MATCH 2", "c": 2.0}, + {"b": "RIGHT NO MATCH 2", "c": -2.0}, + {"b": "RIGHT NO MATCH 3", "c": -3.0}, + {"b": "MATCH 1", "c": 1.0}, + ] + + left_pdf = pd.DataFrame(left_dict) + right_pdf = pd.DataFrame(right_dict) + + left_gdf = cudf.DataFrame.from_pandas(left_pdf) + right_gdf = cudf.DataFrame.from_pandas(right_pdf) + + expect = left_pdf.merge(right_pdf, how="left", on="b") + got = left_gdf.merge(right_gdf, how="left", on="b") + + expect = expect.sort_values(by=["a", "b", "c"]).reset_index(drop=True) + got = got.sort_values(by=["a", "b", "c"]).reset_index(drop=True) + + assert_join_results_equal(expect, got, how="left") diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 8567c479e1c..3c153a16a13 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -23,7 +23,6 @@ NUMERIC_TYPES, assert_eq, assert_exceptions_equal, - assert_join_results_equal, ) from cudf.utils import dtypes as dtypeutils @@ -919,196 +918,6 @@ def test_string_split(data, pat, n, expand): assert_eq(expect, got) -@pytest.mark.parametrize( - "str_data", - [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]], -) -@pytest.mark.parametrize("num_keys", [1, 2, 3]) -@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) -def test_string_join_key(str_data, num_keys, how): - other_data = [1, 2, 3, 4, 5][: len(str_data)] - - pdf = pd.DataFrame() - gdf = cudf.DataFrame() - for i in range(num_keys): - pdf[i] = pd.Series(str_data, dtype="str") - gdf[i] = cudf.Series(str_data, dtype="str") - pdf["a"] = other_data - gdf["a"] = other_data - - pdf2 = pdf.copy() - gdf2 = gdf.copy() - - expect = pdf.merge(pdf2, on=list(range(num_keys)), how=how) - got = gdf.merge(gdf2, on=list(range(num_keys)), how=how) - - if len(expect) == 0 and len(got) == 0: - expect = expect.reset_index(drop=True) - got = got[expect.columns] # reorder columns - - if how == "right": - got = got[expect.columns] # reorder columns - - assert_join_results_equal(expect, got, how=how) - - -@pytest.mark.parametrize( - "str_data_nulls", - [ - ["a", "b", "c"], - ["a", "b", "f", "g"], - ["f", "g", "h", "i", "j"], - ["f", "g", "h"], - [None, None, None, None, None], - [], - ], -) -def test_string_join_key_nulls(str_data_nulls): - str_data = ["a", "b", "c", "d", "e"] - other_data = [1, 2, 3, 4, 5] - - other_data_nulls = [6, 7, 8, 9, 10][: len(str_data_nulls)] - - pdf = pd.DataFrame() - gdf = cudf.DataFrame() - pdf["key"] = pd.Series(str_data, dtype="str") - gdf["key"] = cudf.Series(str_data, dtype="str") - pdf["vals"] = other_data - gdf["vals"] = other_data - - pdf2 = pd.DataFrame() - gdf2 = cudf.DataFrame() - pdf2["key"] = pd.Series(str_data_nulls, dtype="str") - gdf2["key"] = cudf.Series(str_data_nulls, dtype="str") - pdf2["vals"] = pd.Series(other_data_nulls, dtype="int64") - gdf2["vals"] = cudf.Series(other_data_nulls, dtype="int64") - - expect = pdf.merge(pdf2, on="key", how="left") - got = gdf.merge(gdf2, on="key", how="left") - got["vals_y"] = got["vals_y"].fillna(-1) - - if len(expect) == 0 and len(got) == 0: - expect = expect.reset_index(drop=True) - got = got[expect.columns] - - expect["vals_y"] = expect["vals_y"].fillna(-1).astype("int64") - - assert_join_results_equal(expect, got, how="left") - - -@pytest.mark.parametrize( - "str_data", [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]] -) -@pytest.mark.parametrize("num_cols", [1, 2, 3]) -@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) -def test_string_join_non_key(str_data, num_cols, how): - other_data = [1, 2, 3, 4, 5][: len(str_data)] - - pdf = pd.DataFrame() - gdf = cudf.DataFrame() - for i in range(num_cols): - pdf[i] = pd.Series(str_data, dtype="str") - gdf[i] = cudf.Series(str_data, dtype="str") - pdf["a"] = other_data - gdf["a"] = other_data - - pdf2 = pdf.copy() - gdf2 = gdf.copy() - - expect = pdf.merge(pdf2, on=["a"], how=how) - got = gdf.merge(gdf2, on=["a"], how=how) - - if len(expect) == 0 and len(got) == 0: - expect = expect.reset_index(drop=True) - got = got[expect.columns] - - if how == "right": - got = got[expect.columns] # reorder columns - - assert_join_results_equal(expect, got, how=how) - - -@pytest.mark.parametrize( - "str_data_nulls", - [ - ["a", "b", "c"], - ["a", "b", "f", "g"], - ["f", "g", "h", "i", "j"], - ["f", "g", "h"], - [None, None, None, None, None], - [], - ], -) -def test_string_join_non_key_nulls(str_data_nulls): - str_data = ["a", "b", "c", "d", "e"] - other_data = [1, 2, 3, 4, 5] - - other_data_nulls = [6, 7, 8, 9, 10][: len(str_data_nulls)] - - pdf = pd.DataFrame() - gdf = cudf.DataFrame() - pdf["vals"] = pd.Series(str_data, dtype="str") - gdf["vals"] = cudf.Series(str_data, dtype="str") - pdf["key"] = other_data - gdf["key"] = other_data - - pdf2 = pd.DataFrame() - gdf2 = cudf.DataFrame() - pdf2["vals"] = pd.Series(str_data_nulls, dtype="str") - gdf2["vals"] = cudf.Series(str_data_nulls, dtype="str") - pdf2["key"] = pd.Series(other_data_nulls, dtype="int64") - gdf2["key"] = cudf.Series(other_data_nulls, dtype="int64") - - expect = pdf.merge(pdf2, on="key", how="left") - got = gdf.merge(gdf2, on="key", how="left") - - if len(expect) == 0 and len(got) == 0: - expect = expect.reset_index(drop=True) - got = got[expect.columns] - - assert_join_results_equal(expect, got, how="left") - - -def test_string_join_values_nulls(): - left_dict = [ - {"b": "MATCH 1", "a": 1.0}, - {"b": "MATCH 1", "a": 1.0}, - {"b": "LEFT NO MATCH 1", "a": -1.0}, - {"b": "MATCH 2", "a": 2.0}, - {"b": "MATCH 2", "a": 2.0}, - {"b": "MATCH 1", "a": 1.0}, - {"b": "MATCH 1", "a": 1.0}, - {"b": "MATCH 2", "a": 2.0}, - {"b": "MATCH 2", "a": 2.0}, - {"b": "LEFT NO MATCH 2", "a": -2.0}, - {"b": "MATCH 3", "a": 3.0}, - {"b": "MATCH 3", "a": 3.0}, - ] - - right_dict = [ - {"b": "RIGHT NO MATCH 1", "c": -1.0}, - {"b": "MATCH 3", "c": 3.0}, - {"b": "MATCH 2", "c": 2.0}, - {"b": "RIGHT NO MATCH 2", "c": -2.0}, - {"b": "RIGHT NO MATCH 3", "c": -3.0}, - {"b": "MATCH 1", "c": 1.0}, - ] - - left_pdf = pd.DataFrame(left_dict) - right_pdf = pd.DataFrame(right_dict) - - left_gdf = cudf.DataFrame.from_pandas(left_pdf) - right_gdf = cudf.DataFrame.from_pandas(right_pdf) - - expect = left_pdf.merge(right_pdf, how="left", on="b") - got = left_gdf.merge(right_gdf, how="left", on="b") - - expect = expect.sort_values(by=["a", "b", "c"]).reset_index(drop=True) - got = got.sort_values(by=["a", "b", "c"]).reset_index(drop=True) - - assert_join_results_equal(expect, got, how="left") - - @pytest.mark.parametrize( "str_data", [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]] ) From 6405634a111e1b44600f079191d2eba3c77f6715 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 24 Jun 2021 11:13:43 -0700 Subject: [PATCH 49/54] pin --- conda/recipes/libcudf_kafka/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml index a8ab6811f5a..ee86564bf2b 100644 --- a/conda/recipes/libcudf_kafka/meta.yaml +++ b/conda/recipes/libcudf_kafka/meta.yaml @@ -27,7 +27,7 @@ requirements: - cmake >=3.20.1 host: - libcudf {{version}} cuda{{ cuda_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - - librdkafka >= 1.6.1,<1.7.0a0 + - librdkafka >=1.6.0,<1.7.0a0 run: - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not From be35c9c125c4c7e3c081dcaaa4a21a4e24f1d52b Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 24 Jun 2021 15:28:14 -0500 Subject: [PATCH 50/54] Update python/cudf/cudf/_lib/utils.pyx Co-authored-by: Ram (Ramakrishna Prabhu) <42624703+rgsl888prabhu@users.noreply.github.com> --- python/cudf/cudf/_lib/utils.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 449d01357b0..e5dfb5a5c35 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -140,7 +140,7 @@ cpdef generate_pandas_metadata(Table table, index): metadata = pa.pandas_compat.construct_metadata( columns_to_convert=[ col - for col in table._data.columns + for col in table._columns ], df=table, column_names=col_names, From ff945bb20168a9e09bb8c19d06f306463b104d27 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 24 Jun 2021 21:01:19 -0700 Subject: [PATCH 51/54] add confluent-kafka --- ci/gpu/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 333c241f615..12ee13433cf 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -85,7 +85,7 @@ gpuci_conda_retry install -y \ # https://docs.rapids.ai/maintainers/depmgmt/ gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env # gpuci_conda_retry install -y "your-pkg=1.0.0" -gpuci_conda_retry install -y "arrow-cpp=4.0.1" "pyarrow=4.0.1" 'arrow-cpp-proc=*=cuda' "pyorc" +gpuci_conda_retry install -y "arrow-cpp=4.0.1" "pyarrow=4.0.1" 'arrow-cpp-proc=*=cuda' "pyorc" "python-confluent-kafka>=1.3.0" gpuci_logger "Check compiler versions" From db56768f19b379ed94cf69a0d1b6ad6758d7734f Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 25 Jun 2021 12:01:22 -0700 Subject: [PATCH 52/54] remove pin --- conda/recipes/libcudf_kafka/meta.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml index ee86564bf2b..6b15890e7c7 100644 --- a/conda/recipes/libcudf_kafka/meta.yaml +++ b/conda/recipes/libcudf_kafka/meta.yaml @@ -2,7 +2,6 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} -{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} package: name: libcudf_kafka @@ -26,7 +25,7 @@ requirements: build: - cmake >=3.20.1 host: - - libcudf {{version}} cuda{{ cuda_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + - libcudf {{version}} - librdkafka >=1.6.0,<1.7.0a0 run: - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not From 915c5dff78fbe4485829279d2214c14f705bc79a Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 25 Jun 2021 14:42:04 -0700 Subject: [PATCH 53/54] fix typo --- python/cudf/cudf/tests/test_list.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index f0fc07ebe7e..a6a9ba97ef5 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -9,7 +9,7 @@ import cudf from cudf import NA from cudf._lib.copying import get_element -from cudf.tests._utils import ( +from cudf.testing._utils import ( DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES, From e9515ba6e53fb91dd3f16f296d40c4342595beab Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 28 Jun 2021 07:00:38 -0700 Subject: [PATCH 54/54] revert local arrow installs --- ci/cpu/build.sh | 3 --- ci/gpu/build.sh | 3 +-- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index 72de545147a..e11a0488624 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -41,9 +41,6 @@ env gpuci_logger "Activate conda env" . /opt/conda/etc/profile.d/conda.sh conda activate rapids -# gpuci_conda_retry remove -y --force arrow-cpp pyarrow 'arrow-cpp-proc=*=cuda' -# gpuci_conda_retry install -y "your-pkg=1.0.0" -conda install -y "arrow-cpp=4.0.1" "pyarrow=4.0.1" 'arrow-cpp-proc=*=cuda' # Remove rapidsai-nightly channel if we are building main branch if [ "$SOURCE_BRANCH" = "main" ]; then diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 12ee13433cf..c854e67fbdf 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -83,9 +83,8 @@ gpuci_conda_retry install -y \ "ucx-py=0.21.*" # https://docs.rapids.ai/maintainers/depmgmt/ -gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env +# gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env # gpuci_conda_retry install -y "your-pkg=1.0.0" -gpuci_conda_retry install -y "arrow-cpp=4.0.1" "pyarrow=4.0.1" 'arrow-cpp-proc=*=cuda' "pyorc" "python-confluent-kafka>=1.3.0" gpuci_logger "Check compiler versions"