From 39528541e6bc63c85049194ed3db6016522b8ca7 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 2 Mar 2021 14:45:58 -0800
Subject: [PATCH 01/54] update pyarrow to 3.0.0

---
 conda/environments/cudf_dev_cuda10.1.yml    |  4 +--
 conda/environments/cudf_dev_cuda10.2.yml    |  4 +--
 conda/environments/cudf_dev_cuda11.0.yml    |  4 +--
 conda/recipes/cudf/meta.yaml                |  2 +-
 conda/recipes/libcudf/meta.yaml             |  2 +-
 cpp/cmake/thirdparty/CUDF_GetArrow.cmake    |  2 +-
 python/cudf/cudf/_lib/gpuarrow.pyx          |  4 +--
 python/cudf/cudf/_lib/transpose.pyx         |  6 ++--
 python/cudf/cudf/_lib/utils.pyx             | 13 ++++----
 python/cudf/cudf/core/column/categorical.py | 36 +++++++++------------
 python/cudf/cudf/core/column/column.py      | 16 ++++-----
 python/cudf/cudf/core/dataframe.py          | 11 ++++---
 python/cudf/cudf/core/frame.py              |  7 ++--
 python/cudf/cudf/core/index.py              | 14 ++++----
 python/cudf/cudf/io/csv.py                  |  2 +-
 python/cudf/cudf/testing/testing.py         |  4 +--
 python/cudf/cudf/tests/__init__.py          |  0
 python/cudf/cudf/tests/test_dataframe.py    |  4 +--
 18 files changed, 63 insertions(+), 72 deletions(-)
 delete mode 100644 python/cudf/cudf/tests/__init__.py

diff --git a/conda/environments/cudf_dev_cuda10.1.yml b/conda/environments/cudf_dev_cuda10.1.yml
index 3541ed1208c..e9ae53f8789 100644
--- a/conda/environments/cudf_dev_cuda10.1.yml
+++ b/conda/environments/cudf_dev_cuda10.1.yml
@@ -18,7 +18,7 @@ dependencies:
   - numba>=0.49.0,!=0.51.0
   - numpy
   - pandas>=1.0,<1.3.0dev0
-  - pyarrow=1.0.1
+  - pyarrow=3.0.0
   - fastavro>=0.22.9
   - notebook>=0.5.0
   - cython>=0.29,<0.30
@@ -47,7 +47,7 @@ dependencies:
   - distributed>=2.22.0
   - streamz
   - dlpack
-  - arrow-cpp=1.0.1
+  - arrow-cpp=3.0.0
   - arrow-cpp-proc * cuda
   - boost-cpp>=1.72.0
   - double-conversion
diff --git a/conda/environments/cudf_dev_cuda10.2.yml b/conda/environments/cudf_dev_cuda10.2.yml
index 839533516fb..22e2d22378e 100644
--- a/conda/environments/cudf_dev_cuda10.2.yml
+++ b/conda/environments/cudf_dev_cuda10.2.yml
@@ -18,7 +18,7 @@ dependencies:
   - numba>=0.49,!=0.51.0
   - numpy
   - pandas>=1.0,<1.3.0dev0
-  - pyarrow=1.0.1
+  - pyarrow=3.0.0
   - fastavro>=0.22.9
   - notebook>=0.5.0
   - cython>=0.29,<0.30
@@ -47,7 +47,7 @@ dependencies:
   - distributed>=2.22.0
   - streamz
   - dlpack
-  - arrow-cpp=1.0.1
+  - arrow-cpp=3.0.0
   - arrow-cpp-proc * cuda
   - boost-cpp>=1.72.0
   - double-conversion
diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml
index 401eaea63da..38ef346778c 100644
--- a/conda/environments/cudf_dev_cuda11.0.yml
+++ b/conda/environments/cudf_dev_cuda11.0.yml
@@ -18,7 +18,7 @@ dependencies:
   - numba>=0.49,!=0.51.0
   - numpy
   - pandas>=1.0,<1.3.0dev0
-  - pyarrow=1.0.1
+  - pyarrow=3.0.0
   - fastavro>=0.22.9
   - notebook>=0.5.0
   - cython>=0.29,<0.30
@@ -47,7 +47,7 @@ dependencies:
   - distributed>=2.22.0
   - streamz
   - dlpack
-  - arrow-cpp=1.0.1
+  - arrow-cpp=3.0.0
   - arrow-cpp-proc * cuda
   - boost-cpp>=1.72.0
   - double-conversion
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index 5635f54ba20..f8ecb711d9b 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -27,7 +27,7 @@ requirements:
     - setuptools
     - numba >=0.49.0
     - dlpack
-    - pyarrow 1.0.1
+    - pyarrow 3.0.0
     - libcudf {{ version }}
     - rmm {{ minor_version }}
     - cudatoolkit {{ cuda_version }}
diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 885a22870bb..4490a3547e0 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -37,7 +37,7 @@ requirements:
   host:
     - librmm {{ minor_version }}.*
     - cudatoolkit {{ cuda_version }}.*
-    - arrow-cpp 1.0.1
+    - arrow-cpp 3.0.0
     - arrow-cpp-proc * cuda
     - boost-cpp 1.72.0
     - dlpack
diff --git a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake
index 425a9af897d..f9ddbc80583 100644
--- a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake
+++ b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake
@@ -118,6 +118,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC)
 
 endfunction()
 
-set(CUDF_VERSION_Arrow 1.0.1)
+set(CUDF_VERSION_Arrow 3.0.0)
 
 find_and_configure_arrow(${CUDF_VERSION_Arrow} ${ARROW_STATIC_LIB})
diff --git a/python/cudf/cudf/_lib/gpuarrow.pyx b/python/cudf/cudf/_lib/gpuarrow.pyx
index 6513cd59424..a7da22637b9 100644
--- a/python/cudf/cudf/_lib/gpuarrow.pyx
+++ b/python/cudf/cudf/_lib/gpuarrow.pyx
@@ -15,7 +15,7 @@ from pyarrow.includes.libarrow cimport (
     CRecordBatchStreamReader
 )
 from pyarrow.lib cimport (
-    _CRecordBatchReader,
+    RecordBatchReader,
     Buffer,
     Schema,
     pyarrow_wrap_schema
@@ -23,7 +23,7 @@ from pyarrow.lib cimport (
 import pyarrow as pa
 
 
-cdef class CudaRecordBatchStreamReader(_CRecordBatchReader):
+cdef class CudaRecordBatchStreamReader(RecordBatchReader):
     cdef:
         CIpcReadOptions options
 
diff --git a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx
index d2b053789cd..fd02a08c49e 100644
--- a/python/cudf/cudf/_lib/transpose.pyx
+++ b/python/cudf/cudf/_lib/transpose.pyx
@@ -36,11 +36,11 @@ def transpose(Table source):
     if is_categorical_dtype(dtype):
         if any(not is_categorical_dtype(c.dtype) for c in source._columns):
             raise ValueError('Columns must all have the same dtype')
-        cats = list(c.cat().categories for c in source._columns)
+        cats = list(c.cat.categories for c in source._columns)
         cats = cudf.Series(cudf.concat(cats)).drop_duplicates()._column
         source = Table(index=source._index, data=[
-            (name, col.cat()._set_categories(
-                col.cat().categories, cats, is_unique=True).codes)
+            (name, col.cat._set_categories(
+                col.cat.categories, cats, is_unique=True).codes)
             for name, col in source._data.items()
         ])
     elif dtype.kind in 'OU':
diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
index 4c4ef17c6b9..26101c8bf7f 100644
--- a/python/cudf/cudf/_lib/utils.pyx
+++ b/python/cudf/cudf/_lib/utils.pyx
@@ -116,12 +116,13 @@ cpdef generate_pandas_metadata(Table table, index):
             index_descriptors.append(descr)
 
     metadata = pa.pandas_compat.construct_metadata(
-        table,
-        col_names,
-        index_levels,
-        index_descriptors,
-        index,
-        types,
+        columns_to_convert=table._data.columns,
+        df=table,
+        column_names=col_names,
+        index_levels=index_levels,
+        index_descriptors=index_descriptors,
+        preserve_index=index,
+        types=types,
     )
 
     md_dict = json.loads(metadata[b"pandas"])
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index c41a458f02b..7f8f1e7228f 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -936,8 +936,9 @@ def ordered(self) -> Optional[bool]:
     def ordered(self, value: bool):
         self.dtype.ordered = value
 
-    def cat(self, parent: ParentType = None):
-        return CategoricalAccessor(self, parent=parent)
+    @property
+    def cat(self):
+        return CategoricalAccessor(self, parent=None)
 
     def unary_operator(self, unaryop: str):
         raise TypeError(
@@ -1085,7 +1086,7 @@ def to_pandas(
             col = self
 
         signed_dtype = min_signed_type(len(col.categories))
-        codes = col.cat().codes.astype(signed_dtype).fillna(-1).to_array()
+        codes = col.cat.codes.astype(signed_dtype).fillna(-1).to_array()
         categories = col.categories.dropna(drop_nan=True).to_pandas()
         data = pd.Categorical.from_codes(
             codes, categories=categories, ordered=col.ordered
@@ -1198,13 +1199,11 @@ def find_and_replace(
         # named 'index', which came from the filtered categories,
         # contains the new ints that we need to map to
         to_replace_col = column.as_column(catmap.index).astype(
-            self.cat().codes.dtype
-        )
-        replacement_col = catmap["index"]._column.astype(
-            self.cat().codes.dtype
+            self.cat.codes.dtype
         )
+        replacement_col = catmap["index"]._column.astype(self.cat.codes.dtype)
 
-        replaced = column.as_column(self.cat().codes)
+        replaced = column.as_column(self.cat.codes)
         output = libcudf.replace.replace(
             replaced, to_replace_col, replacement_col
         )
@@ -1282,10 +1281,8 @@ def fillna(
                         )
                 # TODO: only required if fill_value has a subset of the
                 # categories:
-                fill_value = fill_value.cat()._set_categories(
-                    fill_value.cat().categories,
-                    self.categories,
-                    is_unique=True,
+                fill_value = fill_value.cat._set_categories(
+                    fill_value.cat.categories, self.categories, is_unique=True,
                 )
                 fill_value = column.as_column(fill_value.codes).astype(
                     self.codes.dtype
@@ -1363,7 +1360,7 @@ def as_categorical_column(
             # return a column full of Nulls.
             return _create_empty_categorical_column(self, dtype)
 
-        return self.cat().set_categories(
+        return self.cat.set_categories(
             new_categories=dtype.categories, ordered=dtype.ordered
         )
 
@@ -1388,8 +1385,8 @@ def as_timedelta_column(self, dtype, **kwargs) -> TimeDeltaColumn:
     def _get_decategorized_column(self) -> ColumnBase:
         if self.null_count == len(self):
             # self.categories is empty; just return codes
-            return self.cat().codes._column
-        gather_map = self.cat().codes.astype("int32").fillna(0)._column
+            return self.cat.codes._column
+        gather_map = self.cat.codes.astype("int32").fillna(0)._column
         out = self.categories.take(gather_map)
         out = out.set_mask(self.mask)
         return out
@@ -1422,9 +1419,7 @@ def copy(self, deep: bool = True) -> CategoricalColumn:
             )
 
     def __sizeof__(self) -> int:
-        return (
-            self.cat().categories.__sizeof__() + self.cat().codes.__sizeof__()
-        )
+        return self.cat.categories.__sizeof__() + self.cat.codes.__sizeof__()
 
     def _memory_usage(self, **kwargs) -> int:
         deep = kwargs.get("deep", False)
@@ -1432,8 +1427,7 @@ def _memory_usage(self, **kwargs) -> int:
             return self.__sizeof__()
         else:
             return (
-                self.categories._memory_usage()
-                + self.cat().codes.memory_usage()
+                self.categories._memory_usage() + self.cat.codes.memory_usage()
             )
 
     def _mimic_inplace(
@@ -1459,7 +1453,7 @@ def _create_empty_categorical_column(
             cudf.utils.utils.scalar_broadcast_to(
                 categorical_column.default_na_value(),
                 categorical_column.size,
-                np.dtype(categorical_column.cat().codes),
+                np.dtype(categorical_column.cat.codes),
             )
         ),
         offset=categorical_column.offset,
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 1bad2c3a451..dd59e0909a0 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -63,6 +63,7 @@
 from cudf.utils.utils import mask_dtype
 
 T = TypeVar("T", bound="ColumnBase")
+ParentType = Union["cudf.Series", "cudf.Index"]
 
 
 class ColumnBase(Column, Serializable):
@@ -188,9 +189,8 @@ def __sizeof__(self) -> int:
             n += bitmask_allocation_size_bytes(self.size)
         return n
 
-    def cat(
-        self, parent=None
-    ) -> "cudf.core.column.categorical.CategoricalAccessor":
+    @property
+    def cat(self) -> "cudf.core.column.categorical.CategoricalAccessor":
         raise NotImplementedError()
 
     def str(self, parent=None) -> "cudf.core.column.string.StringMethods":
@@ -253,21 +253,19 @@ def _concat(
         if is_categorical:
             # Combine and de-dupe the categories
             cats = (
-                cudf.concat([o.cat().categories for o in objs])
+                cudf.concat([o.cat.categories for o in objs])
                 .to_series()
                 .drop_duplicates(ignore_index=True)
                 ._column
             )
             objs = [
-                o.cat()._set_categories(
-                    o.cat().categories, cats, is_unique=True
-                )
+                o.cat._set_categories(o.cat.categories, cats, is_unique=True)
                 for o in objs
             ]
             # Map `objs` into a list of the codes until we port Categorical to
             # use the libcudf++ Category data type.
-            objs = [o.cat().codes._column for o in objs]
-            head = head.cat().codes._column
+            objs = [o.cat.codes._column for o in objs]
+            head = head.cat.codes._column
 
         newsize = sum(map(len, objs))
         if newsize > libcudf.MAX_COLUMN_SIZE:
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 18a7f052d62..c0e1b8995aa 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -5396,11 +5396,12 @@ def to_arrow(self, preserve_index=True):
 
         out = super(DataFrame, data).to_arrow()
         metadata = pa.pandas_compat.construct_metadata(
-            self,
-            out.schema.names,
-            [self.index],
-            index_descr,
-            preserve_index,
+            columns_to_convert=self._data.columns,
+            df=self,
+            column_names=out.schema.names,
+            index_levels=[self.index],
+            index_descriptors=index_descr,
+            preserve_index=preserve_index,
             types=out.schema.types,
         )
 
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 926aad368b0..f413537bbeb 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -3792,7 +3792,7 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes):
         ):
             # Combine and de-dupe the categories
             categories[idx] = (
-                cudf.concat([col.cat().categories for col in cols])
+                cudf.concat([col.cat.categories for col in cols])
                 .to_series()
                 .drop_duplicates(ignore_index=True)
                 ._column
@@ -3823,9 +3823,8 @@ def _cast_cols_to_common_dtypes(col_idxs, list_of_columns, dtypes, categories):
                 if idx in categories:
                     cols[idx] = (
                         cols[idx]
-                        .cat()
-                        ._set_categories(
-                            cols[idx].cat().categories,
+                        .cat._set_categories(
+                            cols[idx].cat.categories,
                             categories[idx],
                             is_unique=True,
                         )
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 8c86352b2a7..b605d51af90 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -2672,17 +2672,15 @@ def __new__(
             dtype = None
 
         if categories is not None:
-            data.cat().set_categories(
-                categories, ordered=ordered, inplace=True
-            )
+            data.cat.set_categories(categories, ordered=ordered, inplace=True)
         elif isinstance(dtype, (pd.CategoricalDtype, cudf.CategoricalDtype)):
-            data.cat().set_categories(
+            data.cat.set_categories(
                 dtype.categories, ordered=ordered, inplace=True
             )
         elif ordered is True and data.ordered is False:
-            data.cat().as_ordered(inplace=True)
+            data.cat.as_ordered(inplace=True)
         elif ordered is False and data.ordered is True:
-            data.cat().as_unordered(inplace=True)
+            data.cat.as_unordered(inplace=True)
 
         out._initialize(data, **kwargs)
 
@@ -2693,14 +2691,14 @@ def codes(self):
         """
         The category codes of this categorical.
         """
-        return self._values.cat().codes
+        return self._values.cat.codes
 
     @property
     def categories(self):
         """
         The categories of this categorical.
         """
-        return self._values.cat().categories
+        return self._values.cat.categories
 
 
 class StringIndex(GenericIndex):
diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py
index e2c7ca7dca1..f9b60cfce0b 100644
--- a/python/cudf/cudf/io/csv.py
+++ b/python/cudf/cudf/io/csv.py
@@ -177,7 +177,7 @@ def to_csv(
         df = df.copy(deep=False)
         for col_name, col in df._data.items():
             if isinstance(col, cudf.core.column.CategoricalColumn):
-                df._data[col_name] = col.astype(col.cat().categories.dtype)
+                df._data[col_name] = col.astype(col.cat.categories.dtype)
 
         if isinstance(df.index, cudf.CategoricalIndex):
             df.index = df.index.astype(df.index.categories.dtype)
diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py
index bacab24a6f3..9930327d89d 100644
--- a/python/cudf/cudf/testing/testing.py
+++ b/python/cudf/cudf/testing/testing.py
@@ -162,8 +162,8 @@ def assert_column_equal(
 
     if check_exact and check_categorical:
         if is_categorical_dtype(left) and is_categorical_dtype(right):
-            left_cat = left.cat().categories
-            right_cat = right.cat().categories
+            left_cat = left.cat.categories
+            right_cat = right.cat.categories
 
             if check_category_order:
                 assert_index_equal(
diff --git a/python/cudf/cudf/tests/__init__.py b/python/cudf/cudf/tests/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 69f6ecfeb17..b04d2ca3da8 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -5149,8 +5149,8 @@ def test_memory_usage_cat():
     gdf = cudf.from_pandas(df)
 
     expected = (
-        gdf.B._column.cat().categories.__sizeof__()
-        + gdf.B._column.cat().codes.__sizeof__()
+        gdf.B._column.cat.categories.__sizeof__()
+        + gdf.B._column.cat.codes.__sizeof__()
     )
 
     # Check cat column

From 74603fbc5e78918042397b76d5464250a2265afc Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 3 Mar 2021 14:11:49 -0600
Subject: [PATCH 02/54] move utility code out of tests folder

---
 python/cudf/cudf/_fuzz_testing/avro.py                    | 2 +-
 python/cudf/cudf/_fuzz_testing/csv.py                     | 2 +-
 python/cudf/cudf/_fuzz_testing/json.py                    | 2 +-
 python/cudf/cudf/_fuzz_testing/orc.py                     | 2 +-
 python/cudf/cudf/_fuzz_testing/parquet.py                 | 2 +-
 python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py     | 2 +-
 python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py    | 2 +-
 python/cudf/cudf/_fuzz_testing/utils.py                   | 2 +-
 python/cudf/cudf/{tests => testing}/dataset_generator.py  | 0
 python/cudf/cudf/{tests => testing}/utils.py              | 0
 python/cudf/cudf/tests/test_apply_rows.py                 | 2 +-
 python/cudf/cudf/tests/test_applymap.py                   | 2 +-
 python/cudf/cudf/tests/test_array_function.py             | 2 +-
 python/cudf/cudf/tests/test_array_ufunc.py                | 7 ++++---
 .../cudf/tests/test_avro_reader_fastavro_integration.py   | 2 +-
 python/cudf/cudf/tests/test_binops.py                     | 2 +-
 python/cudf/cudf/tests/test_categorical.py                | 2 +-
 python/cudf/cudf/tests/test_column.py                     | 8 ++++----
 python/cudf/cudf/tests/test_column_accessor.py            | 2 +-
 python/cudf/cudf/tests/test_concat.py                     | 2 +-
 python/cudf/cudf/tests/test_contains.py                   | 2 +-
 python/cudf/cudf/tests/test_copying.py                    | 2 +-
 python/cudf/cudf/tests/test_csv.py                        | 2 +-
 python/cudf/cudf/tests/test_cuda_apply.py                 | 2 +-
 python/cudf/cudf/tests/test_cuda_array_interface.py       | 2 +-
 python/cudf/cudf/tests/test_custom_accessor.py            | 4 ++--
 python/cudf/cudf/tests/test_dataframe.py                  | 4 ++--
 python/cudf/cudf/tests/test_dataframe_copy.py             | 2 +-
 python/cudf/cudf/tests/test_datasets.py                   | 2 +-
 python/cudf/cudf/tests/test_datetime.py                   | 2 +-
 python/cudf/cudf/tests/test_dlpack.py                     | 2 +-
 python/cudf/cudf/tests/test_dropna.py                     | 2 +-
 python/cudf/cudf/tests/test_dtypes.py                     | 4 ++--
 python/cudf/cudf/tests/test_duplicates.py                 | 2 +-
 python/cudf/cudf/tests/test_factorize.py                  | 2 +-
 python/cudf/cudf/tests/test_feather.py                    | 2 +-
 python/cudf/cudf/tests/test_fill.py                       | 2 +-
 python/cudf/cudf/tests/test_gcs.py                        | 2 +-
 python/cudf/cudf/tests/test_gpu_arrow_parser.py           | 2 +-
 python/cudf/cudf/tests/test_groupby.py                    | 2 +-
 python/cudf/cudf/tests/test_hdf.py                        | 2 +-
 python/cudf/cudf/tests/test_hdfs.py                       | 2 +-
 python/cudf/cudf/tests/test_index.py                      | 2 +-
 python/cudf/cudf/tests/test_indexing.py                   | 8 ++++++--
 python/cudf/cudf/tests/test_interval.py                   | 2 +-
 python/cudf/cudf/tests/test_joining.py                    | 2 +-
 python/cudf/cudf/tests/test_json.py                       | 2 +-
 python/cudf/cudf/tests/test_list.py                       | 2 +-
 python/cudf/cudf/tests/test_monotonic.py                  | 2 +-
 python/cudf/cudf/tests/test_multiindex.py                 | 2 +-
 python/cudf/cudf/tests/test_numerical.py                  | 2 +-
 python/cudf/cudf/tests/test_numpy_interop.py              | 2 +-
 python/cudf/cudf/tests/test_onehot.py                     | 2 +-
 python/cudf/cudf/tests/test_ops.py                        | 2 +-
 python/cudf/cudf/tests/test_orc.py                        | 6 +++++-
 python/cudf/cudf/tests/test_pandas_interop.py             | 2 +-
 python/cudf/cudf/tests/test_parquet.py                    | 4 ++--
 python/cudf/cudf/tests/test_pickling.py                   | 2 +-
 python/cudf/cudf/tests/test_quantiles.py                  | 2 +-
 python/cudf/cudf/tests/test_query.py                      | 2 +-
 python/cudf/cudf/tests/test_query_mask.py                 | 2 +-
 python/cudf/cudf/tests/test_rank.py                       | 2 +-
 python/cudf/cudf/tests/test_reductions.py                 | 4 ++--
 python/cudf/cudf/tests/test_replace.py                    | 2 +-
 python/cudf/cudf/tests/test_repr.py                       | 2 +-
 python/cudf/cudf/tests/test_reshape.py                    | 2 +-
 python/cudf/cudf/tests/test_rolling.py                    | 2 +-
 python/cudf/cudf/tests/test_s3.py                         | 2 +-
 python/cudf/cudf/tests/test_scalar.py                     | 2 +-
 python/cudf/cudf/tests/test_scan.py                       | 7 ++++++-
 python/cudf/cudf/tests/test_search.py                     | 2 +-
 python/cudf/cudf/tests/test_serialize.py                  | 4 ++--
 python/cudf/cudf/tests/test_series.py                     | 2 +-
 python/cudf/cudf/tests/test_seriesmap.py                  | 4 ++--
 python/cudf/cudf/tests/test_setitem.py                    | 2 +-
 python/cudf/cudf/tests/test_sorting.py                    | 2 +-
 python/cudf/cudf/tests/test_sparse_df.py                  | 2 +-
 python/cudf/cudf/tests/test_stats.py                      | 2 +-
 python/cudf/cudf/tests/test_string.py                     | 2 +-
 python/cudf/cudf/tests/test_struct.py                     | 2 +-
 python/cudf/cudf/tests/test_testing.py                    | 2 +-
 python/cudf/cudf/tests/test_text.py                       | 2 +-
 python/cudf/cudf/tests/test_timedelta.py                  | 4 ++--
 python/cudf/cudf/tests/test_transform.py                  | 2 +-
 python/cudf/cudf/tests/test_unaops.py                     | 2 +-
 python/custreamz/custreamz/tests/test_kafka.py            | 2 +-
 python/dask_cudf/dask_cudf/tests/test_accessor.py         | 2 +-
 python/dask_cudf/dask_cudf/tests/test_core.py             | 6 +++---
 python/dask_cudf/dask_cudf/tests/test_distributed.py      | 2 +-
 89 files changed, 117 insertions(+), 103 deletions(-)
 rename python/cudf/cudf/{tests => testing}/dataset_generator.py (100%)
 rename python/cudf/cudf/{tests => testing}/utils.py (100%)

diff --git a/python/cudf/cudf/_fuzz_testing/avro.py b/python/cudf/cudf/_fuzz_testing/avro.py
index a07e3acf416..4c167ac627f 100644
--- a/python/cudf/cudf/_fuzz_testing/avro.py
+++ b/python/cudf/cudf/_fuzz_testing/avro.py
@@ -15,7 +15,7 @@
     pandas_to_avro,
     pyarrow_to_pandas,
 )
-from cudf.tests import dataset_generator as dg
+from cudf.testing import dataset_generator as dg
 
 logging.basicConfig(
     format="%(asctime)s %(levelname)-8s %(message)s",
diff --git a/python/cudf/cudf/_fuzz_testing/csv.py b/python/cudf/cudf/_fuzz_testing/csv.py
index 84346ed61ad..0acb9c8a471 100644
--- a/python/cudf/cudf/_fuzz_testing/csv.py
+++ b/python/cudf/cudf/_fuzz_testing/csv.py
@@ -12,7 +12,7 @@
     _generate_rand_meta,
     pyarrow_to_pandas,
 )
-from cudf.tests import dataset_generator as dg
+from cudf.testing import dataset_generator as dg
 from cudf.utils.dtypes import pandas_dtypes_to_cudf_dtypes
 
 logging.basicConfig(
diff --git a/python/cudf/cudf/_fuzz_testing/json.py b/python/cudf/cudf/_fuzz_testing/json.py
index 5ecb27f7665..df9226cf059 100644
--- a/python/cudf/cudf/_fuzz_testing/json.py
+++ b/python/cudf/cudf/_fuzz_testing/json.py
@@ -13,7 +13,7 @@
     _generate_rand_meta,
     pyarrow_to_pandas,
 )
-from cudf.tests import dataset_generator as dg
+from cudf.testing import dataset_generator as dg
 from cudf.utils.dtypes import pandas_dtypes_to_cudf_dtypes
 
 logging.basicConfig(
diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py
index 607294a49c9..2aa01eb3967 100644
--- a/python/cudf/cudf/_fuzz_testing/orc.py
+++ b/python/cudf/cudf/_fuzz_testing/orc.py
@@ -16,7 +16,7 @@
     pandas_to_orc,
     pyarrow_to_pandas,
 )
-from cudf.tests import dataset_generator as dg
+from cudf.testing import dataset_generator as dg
 
 logging.basicConfig(
     format="%(asctime)s %(levelname)-8s %(message)s",
diff --git a/python/cudf/cudf/_fuzz_testing/parquet.py b/python/cudf/cudf/_fuzz_testing/parquet.py
index 4a9b63cd6aa..ca08e3348c0 100644
--- a/python/cudf/cudf/_fuzz_testing/parquet.py
+++ b/python/cudf/cudf/_fuzz_testing/parquet.py
@@ -12,7 +12,7 @@
     _generate_rand_meta,
     pyarrow_to_pandas,
 )
-from cudf.tests import dataset_generator as dg
+from cudf.testing import dataset_generator as dg
 
 logging.basicConfig(
     format="%(asctime)s %(levelname)-8s %(message)s",
diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py
index e6a5d081980..ff2ccd19696 100644
--- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py
+++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py
@@ -13,7 +13,7 @@
     compare_content,
     run_test,
 )
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 
 @pythonfuzz(data_handle=CSVReader)
diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py
index f3da03f447b..6c9d564d088 100644
--- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py
+++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py
@@ -9,7 +9,7 @@
 from cudf._fuzz_testing.json import JSONReader, JSONWriter
 from cudf._fuzz_testing.main import pythonfuzz
 from cudf._fuzz_testing.utils import ALL_POSSIBLE_VALUES, run_test
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 
 @pythonfuzz(data_handle=JSONReader)
diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py
index efcbd8ca792..6869e40492c 100644
--- a/python/cudf/cudf/_fuzz_testing/utils.py
+++ b/python/cudf/cudf/_fuzz_testing/utils.py
@@ -9,7 +9,7 @@
 import pyorc
 
 import cudf
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 from cudf.utils.dtypes import (
     pandas_dtypes_to_cudf_dtypes,
     pyarrow_dtypes_to_pandas_dtypes,
diff --git a/python/cudf/cudf/tests/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py
similarity index 100%
rename from python/cudf/cudf/tests/dataset_generator.py
rename to python/cudf/cudf/testing/dataset_generator.py
diff --git a/python/cudf/cudf/tests/utils.py b/python/cudf/cudf/testing/utils.py
similarity index 100%
rename from python/cudf/cudf/tests/utils.py
rename to python/cudf/cudf/testing/utils.py
diff --git a/python/cudf/cudf/tests/test_apply_rows.py b/python/cudf/cudf/tests/test_apply_rows.py
index 0ba80278fca..721e13b670f 100644
--- a/python/cudf/cudf/tests/test_apply_rows.py
+++ b/python/cudf/cudf/tests/test_apply_rows.py
@@ -2,7 +2,7 @@
 
 import cudf
 from cudf.core.column import column
-from cudf.tests.utils import assert_eq, gen_rand_series
+from cudf.testing.utils import assert_eq, gen_rand_series
 
 
 def _kernel_multiply(a, b, out):
diff --git a/python/cudf/cudf/tests/test_applymap.py b/python/cudf/cudf/tests/test_applymap.py
index 1f35bc93c78..eeacf05b33b 100644
--- a/python/cudf/cudf/tests/test_applymap.py
+++ b/python/cudf/cudf/tests/test_applymap.py
@@ -7,7 +7,7 @@
 import pytest
 
 from cudf import Series
-from cudf.tests import utils
+from cudf.testing import utils
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py
index 03f9cf1d7e5..71804cb717e 100644
--- a/python/cudf/cudf/tests/test_array_function.py
+++ b/python/cudf/cudf/tests/test_array_function.py
@@ -4,7 +4,7 @@
 import pytest
 
 import cudf
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 from cudf.utils.utils import IS_NEP18_ACTIVE
 
 missing_arrfunc_cond = not IS_NEP18_ACTIVE
diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py
index f9e0bb2ce8a..c888d32276b 100644
--- a/python/cudf/cudf/tests/test_array_ufunc.py
+++ b/python/cudf/cudf/tests/test_array_ufunc.py
@@ -1,9 +1,10 @@
-import cudf
-import numpy as np
 import cupy as cp
+import numpy as np
 import pandas as pd
 import pytest
-from cudf.tests.utils import assert_eq
+
+import cudf
+from cudf.testing.utils import assert_eq
 
 
 @pytest.fixture
diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
index a52ee937574..050f98d5ed5 100644
--- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
+++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
@@ -18,7 +18,7 @@
 import pytest
 
 import cudf
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 
 def cudf_from_avro_util(schema, records):
diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py
index 579716f8277..c07efc90f10 100644
--- a/python/cudf/cudf/tests/test_binops.py
+++ b/python/cudf/cudf/tests/test_binops.py
@@ -14,7 +14,7 @@
 import cudf
 from cudf.core import Series
 from cudf.core.index import as_index
-from cudf.tests import utils
+from cudf.testing import utils
 from cudf.utils.dtypes import (
     BOOL_TYPES,
     DATETIME_TYPES,
diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
index a117c15f14d..042bdea81f5 100644
--- a/python/cudf/cudf/tests/test_categorical.py
+++ b/python/cudf/cudf/tests/test_categorical.py
@@ -9,7 +9,7 @@
 
 import cudf
 from cudf.core._compat import PANDAS_GE_110
-from cudf.tests.utils import assert_eq, assert_exceptions_equal
+from cudf.testing.utils import assert_eq, assert_exceptions_equal
 
 
 @pytest.fixture
diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py
index 9509cabc117..10192ea85ba 100644
--- a/python/cudf/cudf/tests/test_column.py
+++ b/python/cudf/cudf/tests/test_column.py
@@ -8,7 +8,7 @@
 import cudf
 from cudf._lib.transform import mask_to_bools
 from cudf.core.column.column import as_column
-from cudf.tests.utils import assert_eq, assert_exceptions_equal
+from cudf.testing.utils import assert_eq, assert_exceptions_equal
 from cudf.utils import dtypes as dtypeutils
 
 dtypes = sorted(
@@ -97,8 +97,8 @@ def test_column_series_multi_dim(data):
 @pytest.mark.parametrize(
     ("data", "error"),
     [
-        ([1, "1.0", "2", -3], TypeError),
-        ([np.nan, 0, "null", cp.nan], TypeError),
+        ([1, "1.0", "2", -3], pa.lib.ArrowInvalid),
+        ([np.nan, 0, "null", cp.nan], pa.lib.ArrowInvalid),
         (
             [np.int32(4), np.float64(1.5), np.float32(1.290994), np.int8(0)],
             None,
@@ -109,7 +109,7 @@ def test_column_mixed_dtype(data, error):
     if error is None:
         cudf.Series(data)
     else:
-        with pytest.raises(TypeError):
+        with pytest.raises(error):
             cudf.Series(data)
 
 
diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py
index 86a7927dcac..7342c04d0db 100644
--- a/python/cudf/cudf/tests/test_column_accessor.py
+++ b/python/cudf/cudf/tests/test_column_accessor.py
@@ -6,7 +6,7 @@
 
 import cudf
 from cudf.core.column_accessor import ColumnAccessor
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 simple_test_data = [
     {},
diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py
index d0e31a82b28..beb505b34d0 100644
--- a/python/cudf/cudf/tests/test_concat.py
+++ b/python/cudf/cudf/tests/test_concat.py
@@ -7,7 +7,7 @@
 import pytest
 
 import cudf as gd
-from cudf.tests.utils import assert_eq, assert_exceptions_equal
+from cudf.testing.utils import assert_eq, assert_exceptions_equal
 from cudf.utils.dtypes import is_categorical_dtype
 
 
diff --git a/python/cudf/cudf/tests/test_contains.py b/python/cudf/cudf/tests/test_contains.py
index 4737faf65a4..ee5e87c2b2c 100644
--- a/python/cudf/cudf/tests/test_contains.py
+++ b/python/cudf/cudf/tests/test_contains.py
@@ -5,7 +5,7 @@
 
 from cudf import Series
 from cudf.core.index import RangeIndex, as_index
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 
 def cudf_date_series(start, stop, freq):
diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py
index ed6a1169a2a..249cdd3c310 100644
--- a/python/cudf/cudf/tests/test_copying.py
+++ b/python/cudf/cudf/tests/test_copying.py
@@ -6,7 +6,7 @@
 
 import cudf
 from cudf.core import Series
-from cudf.tests.utils import NUMERIC_TYPES, OTHER_TYPES, assert_eq
+from cudf.testing.utils import NUMERIC_TYPES, OTHER_TYPES, assert_eq
 
 
 @pytest.mark.parametrize("dtype", NUMERIC_TYPES + OTHER_TYPES)
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index d972d2ad11c..3633873c73d 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -15,7 +15,7 @@
 
 import cudf
 from cudf import read_csv
-from cudf.tests.utils import assert_eq, assert_exceptions_equal
+from cudf.testing.utils import assert_eq, assert_exceptions_equal
 
 
 def make_numeric_dataframe(nrows, dtype):
diff --git a/python/cudf/cudf/tests/test_cuda_apply.py b/python/cudf/cudf/tests/test_cuda_apply.py
index fa880da6804..c608e67ac3e 100644
--- a/python/cudf/cudf/tests/test_cuda_apply.py
+++ b/python/cudf/cudf/tests/test_cuda_apply.py
@@ -9,7 +9,7 @@
 from numba import cuda
 
 from cudf import DataFrame
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 
 @pytest.mark.parametrize("nelem", [1, 2, 64, 128, 129])
diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py
index 42e5ab38f50..47ce8ac1132 100644
--- a/python/cudf/cudf/tests/test_cuda_array_interface.py
+++ b/python/cudf/cudf/tests/test_cuda_array_interface.py
@@ -10,7 +10,7 @@
 from numba import cuda
 
 import cudf
-from cudf.tests.utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq
+from cudf.testing.utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq
 
 
 @pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES)
diff --git a/python/cudf/cudf/tests/test_custom_accessor.py b/python/cudf/cudf/tests/test_custom_accessor.py
index d72b5875677..b032a8d0eda 100644
--- a/python/cudf/cudf/tests/test_custom_accessor.py
+++ b/python/cudf/cudf/tests/test_custom_accessor.py
@@ -2,9 +2,9 @@
 
 import pandas as pd
 import pytest
-import cudf as gd
 
-from cudf.tests.utils import assert_eq
+import cudf as gd
+from cudf.testing.utils import assert_eq
 
 
 @gd.api.extensions.register_dataframe_accessor("point")
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index b04d2ca3da8..60bd6a0b801 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -20,8 +20,8 @@
 import cudf
 from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_120
 from cudf.core.column import column
-from cudf.tests import utils
-from cudf.tests.utils import (
+from cudf.testing import utils
+from cudf.testing.utils import (
     ALL_TYPES,
     DATETIME_TYPES,
     NUMERIC_TYPES,
diff --git a/python/cudf/cudf/tests/test_dataframe_copy.py b/python/cudf/cudf/tests/test_dataframe_copy.py
index 35788e660ea..8c9a1b42ae7 100644
--- a/python/cudf/cudf/tests/test_dataframe_copy.py
+++ b/python/cudf/cudf/tests/test_dataframe_copy.py
@@ -7,7 +7,7 @@
 from numba import cuda
 
 from cudf.core.dataframe import DataFrame
-from cudf.tests.utils import ALL_TYPES, assert_eq
+from cudf.testing.utils import ALL_TYPES, assert_eq
 
 """
 DataFrame copy expectations
diff --git a/python/cudf/cudf/tests/test_datasets.py b/python/cudf/cudf/tests/test_datasets.py
index a603a6b4658..ccb66fc7306 100644
--- a/python/cudf/cudf/tests/test_datasets.py
+++ b/python/cudf/cudf/tests/test_datasets.py
@@ -1,7 +1,7 @@
 import numpy as np
 
 import cudf as gd
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 
 def test_dataset_timeseries():
diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
index 7eb8fcd0aa4..d572dbd4a36 100644
--- a/python/cudf/cudf/tests/test_datetime.py
+++ b/python/cudf/cudf/tests/test_datetime.py
@@ -14,7 +14,7 @@
 import cudf
 from cudf.core import DataFrame, Series
 from cudf.core.index import DatetimeIndex
-from cudf.tests.utils import (
+from cudf.testing.utils import (
     DATETIME_TYPES,
     NUMERIC_TYPES,
     assert_eq,
diff --git a/python/cudf/cudf/tests/test_dlpack.py b/python/cudf/cudf/tests/test_dlpack.py
index b8175d05137..d72b6a49f72 100644
--- a/python/cudf/cudf/tests/test_dlpack.py
+++ b/python/cudf/cudf/tests/test_dlpack.py
@@ -8,7 +8,7 @@
 import pytest
 
 import cudf
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 nelems = [0, 3, 10]
 dtype = [np.uint16, np.int32, np.float64]
diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py
index d01627309d6..684eed62168 100644
--- a/python/cudf/cudf/tests/test_dropna.py
+++ b/python/cudf/cudf/tests/test_dropna.py
@@ -5,7 +5,7 @@
 import pytest
 
 import cudf
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py
index b6e2aac0304..0f1a2b7fe59 100644
--- a/python/cudf/cudf/tests/test_dtypes.py
+++ b/python/cudf/cudf/tests/test_dtypes.py
@@ -9,11 +9,11 @@
 from cudf.core.dtypes import (
     CategoricalDtype,
     Decimal64Dtype,
+    IntervalDtype,
     ListDtype,
     StructDtype,
-    IntervalDtype,
 )
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 
 def test_cdt_basic():
diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py
index f721b7a28e5..457adbd4836 100644
--- a/python/cudf/cudf/tests/test_duplicates.py
+++ b/python/cudf/cudf/tests/test_duplicates.py
@@ -9,7 +9,7 @@
 
 import cudf
 from cudf import concat
-from cudf.tests.utils import assert_eq, assert_exceptions_equal
+from cudf.testing.utils import assert_eq, assert_exceptions_equal
 
 # TODO: PANDAS 1.0 support
 # Revisit drop_duplicates() tests to update parameters like ignore_index.
diff --git a/python/cudf/cudf/tests/test_factorize.py b/python/cudf/cudf/tests/test_factorize.py
index 61d11fa5961..60ba46277f4 100644
--- a/python/cudf/cudf/tests/test_factorize.py
+++ b/python/cudf/cudf/tests/test_factorize.py
@@ -7,7 +7,7 @@
 
 import cudf
 from cudf.core import DataFrame, Index
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 
 @pytest.mark.parametrize("ncats,nelem", [(2, 2), (2, 10), (10, 100)])
diff --git a/python/cudf/cudf/tests/test_feather.py b/python/cudf/cudf/tests/test_feather.py
index 525b88fc7ff..61362edb8b9 100644
--- a/python/cudf/cudf/tests/test_feather.py
+++ b/python/cudf/cudf/tests/test_feather.py
@@ -10,7 +10,7 @@
 import pytest
 
 import cudf
-from cudf.tests.utils import NUMERIC_TYPES, assert_eq
+from cudf.testing.utils import NUMERIC_TYPES, assert_eq
 
 if LooseVersion(pd.__version__) < LooseVersion("0.24"):
     try:
diff --git a/python/cudf/cudf/tests/test_fill.py b/python/cudf/cudf/tests/test_fill.py
index 83d15b36e64..e6904328065 100644
--- a/python/cudf/cudf/tests/test_fill.py
+++ b/python/cudf/cudf/tests/test_fill.py
@@ -2,7 +2,7 @@
 import pytest
 
 import cudf
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_gcs.py b/python/cudf/cudf/tests/test_gcs.py
index 5d287a57df8..181b31f5327 100644
--- a/python/cudf/cudf/tests/test_gcs.py
+++ b/python/cudf/cudf/tests/test_gcs.py
@@ -10,7 +10,7 @@
 import pytest
 
 import cudf
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 gcsfs = pytest.importorskip("gcsfs")
 
diff --git a/python/cudf/cudf/tests/test_gpu_arrow_parser.py b/python/cudf/cudf/tests/test_gpu_arrow_parser.py
index e3c8e69695d..96b051a53b4 100644
--- a/python/cudf/cudf/tests/test_gpu_arrow_parser.py
+++ b/python/cudf/cudf/tests/test_gpu_arrow_parser.py
@@ -8,7 +8,7 @@
 
 import cudf
 from cudf.comm.gpuarrow import GpuArrowReader
-from cudf.tests.utils import INTEGER_TYPES
+from cudf.testing.utils import INTEGER_TYPES
 
 
 def make_gpu_parse_arrow_data_batch():
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 8011510d340..e6d91b87034 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -12,7 +12,7 @@
 import cudf
 from cudf.core import DataFrame, Series
 from cudf.core._compat import PANDAS_GE_110
-from cudf.tests.utils import assert_eq, assert_exceptions_equal
+from cudf.testing.utils import assert_eq, assert_exceptions_equal
 
 _now = np.datetime64("now")
 _tomorrow = _now + np.timedelta64(1, "D")
diff --git a/python/cudf/cudf/tests/test_hdf.py b/python/cudf/cudf/tests/test_hdf.py
index f908d5f51f5..f1d573a5ca2 100644
--- a/python/cudf/cudf/tests/test_hdf.py
+++ b/python/cudf/cudf/tests/test_hdf.py
@@ -8,7 +8,7 @@
 import pytest
 
 import cudf
-from cudf.tests.utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq
+from cudf.testing.utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq
 
 try:
     import tables  # noqa F401
diff --git a/python/cudf/cudf/tests/test_hdfs.py b/python/cudf/cudf/tests/test_hdfs.py
index e3867c620fe..b26315b02fd 100644
--- a/python/cudf/cudf/tests/test_hdfs.py
+++ b/python/cudf/cudf/tests/test_hdfs.py
@@ -11,7 +11,7 @@
 from pyarrow import orc as orc
 
 import cudf
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 if not os.environ.get("RUN_HDFS_TESTS"):
     pytestmark = pytest.mark.skip("Env not configured to run HDFS tests")
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index 688efef555b..c705ef98138 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -20,7 +20,7 @@
     RangeIndex,
     as_index,
 )
-from cudf.tests.utils import (
+from cudf.testing.utils import (
     FLOAT_TYPES,
     NUMERIC_TYPES,
     OTHER_TYPES,
diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py
index 73a074c0376..25a0694a4e5 100644
--- a/python/cudf/cudf/tests/test_indexing.py
+++ b/python/cudf/cudf/tests/test_indexing.py
@@ -9,8 +9,12 @@
 
 import cudf
 from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_120
-from cudf.tests import utils
-from cudf.tests.utils import INTEGER_TYPES, assert_eq, assert_exceptions_equal
+from cudf.testing import utils
+from cudf.testing.utils import (
+    INTEGER_TYPES,
+    assert_eq,
+    assert_exceptions_equal,
+)
 
 index_dtypes = INTEGER_TYPES
 
diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py
index c7eafedd409..680ce6ee597 100644
--- a/python/cudf/cudf/tests/test_interval.py
+++ b/python/cudf/cudf/tests/test_interval.py
@@ -4,7 +4,7 @@
 import pytest
 
 import cudf
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py
index 969cf1bf549..4461a38fcf9 100644
--- a/python/cudf/cudf/tests/test_joining.py
+++ b/python/cudf/cudf/tests/test_joining.py
@@ -7,7 +7,7 @@
 import cudf
 from cudf.core._compat import PANDAS_GE_120
 from cudf.core.dtypes import CategoricalDtype
-from cudf.tests.utils import (
+from cudf.testing.utils import (
     INTEGER_TYPES,
     NUMERIC_TYPES,
     assert_eq,
diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py
index e0a922f35fe..ea85075d766 100644
--- a/python/cudf/cudf/tests/test_json.py
+++ b/python/cudf/cudf/tests/test_json.py
@@ -12,7 +12,7 @@
 
 import cudf
 from cudf.core._compat import PANDAS_GE_110
-from cudf.tests.utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq
+from cudf.testing.utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq
 
 
 def make_numeric_dataframe(nrows, dtype):
diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index 195d8749ec6..07ddf0028f2 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -5,7 +5,7 @@
 import pytest
 
 import cudf
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py
index b26887ad6ae..86304fd3057 100644
--- a/python/cudf/cudf/tests/test_monotonic.py
+++ b/python/cudf/cudf/tests/test_monotonic.py
@@ -16,7 +16,7 @@
     RangeIndex,
     StringIndex,
 )
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 
 @pytest.mark.parametrize("testrange", [(10, 20, 1), (0, -10, -1), (5, 5, 1)])
diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py
index bd78612d6c7..182fd1dc6ea 100644
--- a/python/cudf/cudf/tests/test_multiindex.py
+++ b/python/cudf/cudf/tests/test_multiindex.py
@@ -15,7 +15,7 @@
 import cudf
 from cudf.core.column import as_column
 from cudf.core.index import as_index
-from cudf.tests.utils import assert_eq, assert_exceptions_equal, assert_neq
+from cudf.testing.utils import assert_eq, assert_exceptions_equal, assert_neq
 
 
 def test_multiindex_levels_codes_validation():
diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py
index 6d9bcda2c0b..ed23741b39c 100644
--- a/python/cudf/cudf/tests/test_numerical.py
+++ b/python/cudf/cudf/tests/test_numerical.py
@@ -6,7 +6,7 @@
 
 import cudf
 from cudf.core._compat import PANDAS_GE_100
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 
 def test_can_cast_safely_same_kind():
diff --git a/python/cudf/cudf/tests/test_numpy_interop.py b/python/cudf/cudf/tests/test_numpy_interop.py
index 521840f8a8a..951af77b155 100644
--- a/python/cudf/cudf/tests/test_numpy_interop.py
+++ b/python/cudf/cudf/tests/test_numpy_interop.py
@@ -2,7 +2,7 @@
 import pytest
 
 from cudf.core import DataFrame, Series
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 
 def test_to_records_noindex():
diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py
index f7d9f03832a..1cc2d90e501 100644
--- a/python/cudf/cudf/tests/test_onehot.py
+++ b/python/cudf/cudf/tests/test_onehot.py
@@ -8,7 +8,7 @@
 
 import cudf
 from cudf.core import DataFrame, GenericIndex, Series
-from cudf.tests import utils
+from cudf.testing import utils
 
 
 def test_onehot_simple():
diff --git a/python/cudf/cudf/tests/test_ops.py b/python/cudf/cudf/tests/test_ops.py
index 8cdef19d9ba..b7228739cfa 100644
--- a/python/cudf/cudf/tests/test_ops.py
+++ b/python/cudf/cudf/tests/test_ops.py
@@ -5,7 +5,7 @@
 import pytest
 
 import cudf
-from cudf.tests.utils import assert_eq, gen_rand
+from cudf.testing.utils import assert_eq, gen_rand
 
 
 def test_sqrt_float():
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index ed91e909f25..ff3c81a6cc1 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -13,7 +13,11 @@
 
 import cudf
 from cudf.io.orc import ORCWriter
-from cudf.tests.utils import assert_eq, gen_rand_series, supported_numpy_dtypes
+from cudf.testing.utils import (
+    assert_eq,
+    gen_rand_series,
+    supported_numpy_dtypes,
+)
 
 
 @pytest.fixture(scope="module")
diff --git a/python/cudf/cudf/tests/test_pandas_interop.py b/python/cudf/cudf/tests/test_pandas_interop.py
index 24c60f12a2f..ac1b39c9219 100644
--- a/python/cudf/cudf/tests/test_pandas_interop.py
+++ b/python/cudf/cudf/tests/test_pandas_interop.py
@@ -5,7 +5,7 @@
 
 import cudf
 from cudf.core import DataFrame
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 
 def test_to_pandas():
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index dc4d0615a7f..526e5adfba9 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -17,8 +17,8 @@
 
 import cudf
 from cudf.io.parquet import ParquetWriter, merge_parquet_filemetadata
-from cudf.tests import dataset_generator as dg
-from cudf.tests.utils import assert_eq
+from cudf.testing import dataset_generator as dg
+from cudf.testing.utils import assert_eq
 
 
 @pytest.fixture(scope="module")
diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py
index e87ab3730dd..6ca55e625bf 100644
--- a/python/cudf/cudf/tests/test_pickling.py
+++ b/python/cudf/cudf/tests/test_pickling.py
@@ -8,7 +8,7 @@
 
 from cudf.core import DataFrame, GenericIndex, Series
 from cudf.core.buffer import Buffer
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 if sys.version_info < (3, 8):
     try:
diff --git a/python/cudf/cudf/tests/test_quantiles.py b/python/cudf/cudf/tests/test_quantiles.py
index 49a2603b9a3..f72a8a5fc71 100644
--- a/python/cudf/cudf/tests/test_quantiles.py
+++ b/python/cudf/cudf/tests/test_quantiles.py
@@ -1,7 +1,7 @@
 import pandas as pd
 
 import cudf
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 
 def test_single_q():
diff --git a/python/cudf/cudf/tests/test_query.py b/python/cudf/cudf/tests/test_query.py
index b6915a63947..6e81f6f8457 100644
--- a/python/cudf/cudf/tests/test_query.py
+++ b/python/cudf/cudf/tests/test_query.py
@@ -12,7 +12,7 @@
 
 import cudf
 from cudf.core import DataFrame
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 from cudf.utils import queryutils
 
 _params_query_parser = []
diff --git a/python/cudf/cudf/tests/test_query_mask.py b/python/cudf/cudf/tests/test_query_mask.py
index 35479f8308c..08218a3bdbf 100644
--- a/python/cudf/cudf/tests/test_query_mask.py
+++ b/python/cudf/cudf/tests/test_query_mask.py
@@ -3,7 +3,7 @@
 import pytest
 
 import cudf
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 _data = [
     {"a": [0, 1.0, 2.0, None, np.nan, None, 3, 5]},
diff --git a/python/cudf/cudf/tests/test_rank.py b/python/cudf/cudf/tests/test_rank.py
index c86b2c61aa5..dfd8d4824cd 100644
--- a/python/cudf/cudf/tests/test_rank.py
+++ b/python/cudf/cudf/tests/test_rank.py
@@ -7,7 +7,7 @@
 import pytest
 
 from cudf.core import DataFrame
-from cudf.tests.utils import assert_eq, assert_exceptions_equal
+from cudf.testing.utils import assert_eq, assert_exceptions_equal
 
 
 @pytest.fixture
diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py
index 80a2e89bf46..53b07a253ff 100644
--- a/python/cudf/cudf/tests/test_reductions.py
+++ b/python/cudf/cudf/tests/test_reductions.py
@@ -11,8 +11,8 @@
 
 import cudf
 from cudf.core import Series
-from cudf.tests import utils
-from cudf.tests.utils import NUMERIC_TYPES, gen_rand
+from cudf.testing import utils
+from cudf.testing.utils import NUMERIC_TYPES, gen_rand
 
 params_dtype = NUMERIC_TYPES
 
diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py
index e7baa4ee926..7c7dd948e13 100644
--- a/python/cudf/cudf/tests/test_replace.py
+++ b/python/cudf/cudf/tests/test_replace.py
@@ -8,7 +8,7 @@
 
 import cudf
 from cudf.core import DataFrame, Series
-from cudf.tests.utils import (
+from cudf.testing.utils import (
     INTEGER_TYPES,
     NUMERIC_TYPES,
     assert_eq,
diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py
index 7c274734980..d9de9335889 100644
--- a/python/cudf/cudf/tests/test_repr.py
+++ b/python/cudf/cudf/tests/test_repr.py
@@ -10,7 +10,7 @@
 
 import cudf
 from cudf.core._compat import PANDAS_GE_110
-from cudf.tests import utils
+from cudf.testing import utils
 from cudf.utils.dtypes import cudf_dtypes_to_pandas_dtypes
 
 repr_categories = utils.NUMERIC_TYPES + ["str", "category", "datetime64[ns]"]
diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py
index b030924779d..d25a2dd68ac 100644
--- a/python/cudf/cudf/tests/test_reshape.py
+++ b/python/cudf/cudf/tests/test_reshape.py
@@ -9,7 +9,7 @@
 import cudf
 from cudf import melt as cudf_melt
 from cudf.core._compat import PANDAS_GE_120
-from cudf.tests.utils import (
+from cudf.testing.utils import (
     ALL_TYPES,
     DATETIME_TYPES,
     NUMERIC_TYPES,
diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py
index fcc5591adda..76e09eb5069 100644
--- a/python/cudf/cudf/tests/test_rolling.py
+++ b/python/cudf/cudf/tests/test_rolling.py
@@ -8,7 +8,7 @@
 
 import cudf
 from cudf.core._compat import PANDAS_GE_110
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py
index 2eefcfef7d2..300a4f6e917 100644
--- a/python/cudf/cudf/tests/test_s3.py
+++ b/python/cudf/cudf/tests/test_s3.py
@@ -14,7 +14,7 @@
 import pytest
 
 import cudf
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 moto = pytest.importorskip("moto", minversion="1.3.14")
 boto3 = pytest.importorskip("boto3")
diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py
index 003e46c7e0d..6e8830fb207 100644
--- a/python/cudf/cudf/tests/test_scalar.py
+++ b/python/cudf/cudf/tests/test_scalar.py
@@ -9,7 +9,7 @@
 import cudf
 from cudf import Scalar as pycudf_scalar
 from cudf._lib.copying import get_element
-from cudf.tests.utils import (
+from cudf.testing.utils import (
     ALL_TYPES,
     DATETIME_TYPES,
     NUMERIC_TYPES,
diff --git a/python/cudf/cudf/tests/test_scan.py b/python/cudf/cudf/tests/test_scan.py
index dce65947460..33e9953c9ab 100644
--- a/python/cudf/cudf/tests/test_scan.py
+++ b/python/cudf/cudf/tests/test_scan.py
@@ -5,7 +5,12 @@
 import pytest
 
 import cudf
-from cudf.tests.utils import INTEGER_TYPES, NUMERIC_TYPES, assert_eq, gen_rand
+from cudf.testing.utils import (
+    INTEGER_TYPES,
+    NUMERIC_TYPES,
+    assert_eq,
+    gen_rand,
+)
 
 params_sizes = [0, 1, 2, 5]
 
diff --git a/python/cudf/cudf/tests/test_search.py b/python/cudf/cudf/tests/test_search.py
index 4c42e2cb50f..28da93d3401 100644
--- a/python/cudf/cudf/tests/test_search.py
+++ b/python/cudf/cudf/tests/test_search.py
@@ -5,7 +5,7 @@
 import pytest
 
 import cudf
-from cudf.tests.utils import assert_eq, gen_rand, random_bitmask
+from cudf.testing.utils import assert_eq, gen_rand, random_bitmask
 
 
 @pytest.mark.parametrize("side", ["left", "right"])
diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py
index 656b66bf793..d76575eb8cc 100644
--- a/python/cudf/cudf/tests/test_serialize.py
+++ b/python/cudf/cudf/tests/test_serialize.py
@@ -8,8 +8,8 @@
 import pytest
 
 import cudf
-from cudf.tests import utils
-from cudf.tests.utils import assert_eq
+from cudf.testing import utils
+from cudf.testing.utils import assert_eq
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index ab9d3d91f73..5b9ecc98b40 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -9,7 +9,7 @@
 import pytest
 
 import cudf
-from cudf.tests.utils import (
+from cudf.testing.utils import (
     DATETIME_TYPES,
     NUMERIC_TYPES,
     TIMEDELTA_TYPES,
diff --git a/python/cudf/cudf/tests/test_seriesmap.py b/python/cudf/cudf/tests/test_seriesmap.py
index 324074b6021..be08b3ba9a4 100644
--- a/python/cudf/cudf/tests/test_seriesmap.py
+++ b/python/cudf/cudf/tests/test_seriesmap.py
@@ -4,12 +4,12 @@
 from math import floor
 
 import numpy as np
-import cudf
 import pandas as pd
 import pytest
 
+import cudf
 from cudf import Series
-from cudf.tests.utils import assert_eq, assert_exceptions_equal
+from cudf.testing.utils import assert_eq, assert_exceptions_equal
 
 
 def test_series_map_basic():
diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py
index fc885a13808..cd4eb577d43 100644
--- a/python/cudf/cudf/tests/test_setitem.py
+++ b/python/cudf/cudf/tests/test_setitem.py
@@ -6,7 +6,7 @@
 
 import cudf
 from cudf.core._compat import PANDAS_GE_120
-from cudf.tests.utils import assert_eq, assert_exceptions_equal
+from cudf.testing.utils import assert_eq, assert_exceptions_equal
 
 
 @pytest.mark.parametrize("df", [pd.DataFrame({"a": [1, 2, 3]})])
diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py
index b90aebc33dc..5c8b278aaba 100644
--- a/python/cudf/cudf/tests/test_sorting.py
+++ b/python/cudf/cudf/tests/test_sorting.py
@@ -9,7 +9,7 @@
 
 from cudf.core import DataFrame, Series
 from cudf.core.column import NumericalColumn
-from cudf.tests.utils import (
+from cudf.testing.utils import (
     DATETIME_TYPES,
     NUMERIC_TYPES,
     assert_eq,
diff --git a/python/cudf/cudf/tests/test_sparse_df.py b/python/cudf/cudf/tests/test_sparse_df.py
index 4551f48845f..23d149fe78d 100644
--- a/python/cudf/cudf/tests/test_sparse_df.py
+++ b/python/cudf/cudf/tests/test_sparse_df.py
@@ -8,7 +8,7 @@
 
 from cudf.comm.gpuarrow import GpuArrowReader
 from cudf.core import DataFrame, Series
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 
 def read_data():
diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py
index 4e07c974280..ffa605d5782 100644
--- a/python/cudf/cudf/tests/test_stats.py
+++ b/python/cudf/cudf/tests/test_stats.py
@@ -9,7 +9,7 @@
 
 import cudf
 from cudf.datasets import randomdata
-from cudf.tests.utils import assert_eq, assert_exceptions_equal
+from cudf.testing.utils import assert_eq, assert_exceptions_equal
 
 params_dtypes = [np.int32, np.uint32, np.float32, np.float64]
 methods = ["min", "max", "sum", "mean", "var", "std"]
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index a015f3387b4..09c6a0ee146 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -16,7 +16,7 @@
 from cudf.core._compat import PANDAS_GE_110
 from cudf.core.column.string import StringColumn
 from cudf.core.index import StringIndex, as_index
-from cudf.tests.utils import (
+from cudf.testing.utils import (
     DATETIME_TYPES,
     NUMERIC_TYPES,
     assert_eq,
diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py
index c7efb55c089..abfe1f3a73c 100644
--- a/python/cudf/cudf/tests/test_struct.py
+++ b/python/cudf/cudf/tests/test_struct.py
@@ -4,7 +4,7 @@
 import pytest
 
 import cudf
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_testing.py b/python/cudf/cudf/tests/test_testing.py
index eee7078433d..a90c991937b 100644
--- a/python/cudf/cudf/tests/test_testing.py
+++ b/python/cudf/cudf/tests/test_testing.py
@@ -10,7 +10,7 @@
     assert_index_equal,
     assert_series_equal,
 )
-from cudf.tests.utils import NUMERIC_TYPES, OTHER_TYPES, assert_eq
+from cudf.testing.utils import NUMERIC_TYPES, OTHER_TYPES, assert_eq
 
 
 @pytest.mark.parametrize("rdata", [[1, 2, 5], [1, 2, 6], [1, 2, 5, 6]])
diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py
index 74465c4a54d..be01e6f7c48 100644
--- a/python/cudf/cudf/tests/test_text.py
+++ b/python/cudf/cudf/tests/test_text.py
@@ -5,7 +5,7 @@
 import pytest
 
 import cudf
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 
 def test_tokenize():
diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py
index 3efc30af01e..524731295fc 100644
--- a/python/cudf/cudf/tests/test_timedelta.py
+++ b/python/cudf/cudf/tests/test_timedelta.py
@@ -11,8 +11,8 @@
 
 import cudf
 from cudf.core._compat import PANDAS_GE_120
-from cudf.tests import utils as utils
-from cudf.tests.utils import assert_eq, assert_exceptions_equal
+from cudf.testing import utils as utils
+from cudf.testing.utils import assert_eq, assert_exceptions_equal
 
 _TIMEDELTA_DATA = [
     [1000000, 200000, 3000000],
diff --git a/python/cudf/cudf/tests/test_transform.py b/python/cudf/cudf/tests/test_transform.py
index 6ec5f88be48..3e77f74d350 100644
--- a/python/cudf/cudf/tests/test_transform.py
+++ b/python/cudf/cudf/tests/test_transform.py
@@ -6,7 +6,7 @@
 import pytest
 
 from cudf.core import Series
-from cudf.tests.utils import NUMERIC_TYPES
+from cudf.testing.utils import NUMERIC_TYPES
 
 supported_types = NUMERIC_TYPES
 
diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py
index f132271cfd8..61e437ef9e8 100644
--- a/python/cudf/cudf/tests/test_unaops.py
+++ b/python/cudf/cudf/tests/test_unaops.py
@@ -10,7 +10,7 @@
 
 import cudf
 from cudf.core import Series
-from cudf.tests import utils
+from cudf.testing import utils
 
 _unaops = [operator.abs, operator.invert, operator.neg, np.ceil, np.floor]
 
diff --git a/python/custreamz/custreamz/tests/test_kafka.py b/python/custreamz/custreamz/tests/test_kafka.py
index 059655d4ca0..157b021a0b7 100644
--- a/python/custreamz/custreamz/tests/test_kafka.py
+++ b/python/custreamz/custreamz/tests/test_kafka.py
@@ -2,7 +2,7 @@
 import confluent_kafka as ck
 import pytest
 
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 
 @pytest.mark.parametrize("commit_offset", [-1, 0, 1, 1000])
diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py
index 76589682717..0ede420ed2e 100644
--- a/python/dask_cudf/dask_cudf/tests/test_accessor.py
+++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py
@@ -8,7 +8,7 @@
 import dask_cudf as dgd
 
 from cudf import DataFrame, Series
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 #############################################################################
 #                        Datetime Accessor                                  #
diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py
index e19fe016cc9..6091d0a5681 100644
--- a/python/dask_cudf/dask_cudf/tests/test_core.py
+++ b/python/dask_cudf/dask_cudf/tests/test_core.py
@@ -12,10 +12,10 @@
 from dask.dataframe.core import make_meta, meta_nonempty
 from dask.utils import M
 
-import cudf
-
 import dask_cudf as dgd
 
+import cudf
+
 
 def test_from_cudf():
     np.random.seed(0)
@@ -706,7 +706,7 @@ def test_dataframe_set_index():
 
     pddf = dd.from_pandas(pdf, npartitions=4)
     pddf = pddf.set_index("str")
-    from cudf.tests.utils import assert_eq
+    from cudf.testing.utils import assert_eq
 
     assert_eq(ddf.compute(), pddf.compute())
 
diff --git a/python/dask_cudf/dask_cudf/tests/test_distributed.py b/python/dask_cudf/dask_cudf/tests/test_distributed.py
index cb3c696adc3..7e4adace212 100644
--- a/python/dask_cudf/dask_cudf/tests/test_distributed.py
+++ b/python/dask_cudf/dask_cudf/tests/test_distributed.py
@@ -9,7 +9,7 @@
 import dask_cudf
 
 import cudf
-from cudf.tests.utils import assert_eq
+from cudf.testing.utils import assert_eq
 
 dask_cuda = pytest.importorskip("dask_cuda")
 

From b49ed25b68b279167d3b2d143193f4e212fc42d9 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Thu, 4 Mar 2021 15:34:48 -0600
Subject: [PATCH 03/54] add workaround for creating an arrow table from
 CUDABuffer

---
 python/cudf/cudf/comm/gpuarrow.py             | 29 ++++++++++++-------
 .../cudf/cudf/tests/test_gpu_arrow_parser.py  |  4 ++-
 2 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/python/cudf/cudf/comm/gpuarrow.py b/python/cudf/cudf/comm/gpuarrow.py
index 451572224c6..16ddb582605 100644
--- a/python/cudf/cudf/comm/gpuarrow.py
+++ b/python/cudf/cudf/comm/gpuarrow.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
+
 from collections import OrderedDict
 from collections.abc import Sequence
 
@@ -9,7 +10,7 @@
 from cudf._lib.gpuarrow import (
     CudaRecordBatchStreamReader as _CudaRecordBatchStreamReader,
 )
-from cudf.core import Series, column
+from cudf.core import DataFrame, Series, column
 from cudf.utils.utils import mask_bitsize, mask_dtype
 
 
@@ -33,19 +34,25 @@ def __init__(self, source, schema=None):
 
 class GpuArrowReader(Sequence):
     def __init__(self, schema, dev_ary):
-        self._table = CudaRecordBatchStreamReader(dev_ary, schema).read_all()
+        table = CudaRecordBatchStreamReader(dev_ary, schema).read_all()
+        self._df = DataFrame.from_arrow(table)
+        self._schema = pa.Schema.from_pandas(self._df)
 
     def __len__(self):
-        return self._table.num_columns
+        return len(self._df._data.names)
 
     def __getitem__(self, idx):
-        return GpuArrowNodeReader(self._table, idx)
+        return GpuArrowNodeReader(
+            schema=self._schema,
+            field=self._schema[idx],
+            series=self._df._data.columns[idx],
+        )
 
     def schema(self):
         """
         Return a pyarrow schema
         """
-        return self._table.schema
+        return self._schema
 
     def to_dict(self):
         """
@@ -58,10 +65,10 @@ def to_dict(self):
 
 
 class GpuArrowNodeReader(object):
-    def __init__(self, table, index):
-        self._table = table
-        self._field = table.schema[index]
-        self._series = Series(column.as_column(table.column(index)))
+    def __init__(self, schema, field, series):
+        self._schema = schema
+        self._field = field
+        self._series = Series(column.as_column(series))
         self._series.name = self.name
 
     def __len__(self):
@@ -69,7 +76,7 @@ def __len__(self):
 
     @property
     def schema(self):
-        return self._table.schema
+        return self._schema
 
     @property
     def field_schema(self):
diff --git a/python/cudf/cudf/tests/test_gpu_arrow_parser.py b/python/cudf/cudf/tests/test_gpu_arrow_parser.py
index 96b051a53b4..67c83b9a917 100644
--- a/python/cudf/cudf/tests/test_gpu_arrow_parser.py
+++ b/python/cudf/cudf/tests/test_gpu_arrow_parser.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2018, NVIDIA CORPORATION.
+# Copyright (c) 2018-2021, NVIDIA CORPORATION.
+
 import logging
 
 import numpy as np
@@ -178,6 +179,7 @@ def make_gpu_parse_arrow_cats_batch():
 
 
 def test_gpu_parse_arrow_cats():
+    pytest.xfail(reason="need dictionary mapping in libcudf from_arrow")
     batch = make_gpu_parse_arrow_cats_batch()
 
     stream = pa.BufferOutputStream()

From c917739f5c1842988cda885bf51b3ad41d8e9601 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Thu, 4 Mar 2021 18:24:14 -0600
Subject: [PATCH 04/54] revert cat changes, and make testing utils internal

---
 .../cudf/_fuzz_testing/tests/fuzz_test_csv.py |  2 +-
 .../_fuzz_testing/tests/fuzz_test_json.py     |  2 +-
 python/cudf/cudf/_fuzz_testing/utils.py       |  2 +-
 python/cudf/cudf/_lib/transpose.pyx           |  6 +--
 python/cudf/cudf/_lib/utils.pyx               |  8 +++-
 python/cudf/cudf/core/column/categorical.py   | 37 +++++++++++--------
 python/cudf/cudf/core/column/column.py        | 16 +++++---
 python/cudf/cudf/core/dataframe.py            |  7 +++-
 python/cudf/cudf/core/frame.py                |  7 ++--
 python/cudf/cudf/core/index.py                | 14 ++++---
 python/cudf/cudf/io/csv.py                    |  2 +-
 .../cudf/cudf/testing/{utils.py => _utils.py} |  0
 python/cudf/cudf/testing/testing.py           |  4 +-
 python/cudf/cudf/tests/test_apply_rows.py     |  2 +-
 python/cudf/cudf/tests/test_array_function.py |  2 +-
 python/cudf/cudf/tests/test_array_ufunc.py    |  2 +-
 .../test_avro_reader_fastavro_integration.py  |  2 +-
 python/cudf/cudf/tests/test_categorical.py    |  2 +-
 python/cudf/cudf/tests/test_column.py         |  2 +-
 .../cudf/cudf/tests/test_column_accessor.py   |  2 +-
 python/cudf/cudf/tests/test_concat.py         |  2 +-
 python/cudf/cudf/tests/test_contains.py       |  2 +-
 python/cudf/cudf/tests/test_copying.py        |  2 +-
 python/cudf/cudf/tests/test_csv.py            |  2 +-
 python/cudf/cudf/tests/test_cuda_apply.py     |  2 +-
 .../cudf/tests/test_cuda_array_interface.py   |  2 +-
 .../cudf/cudf/tests/test_custom_accessor.py   |  2 +-
 python/cudf/cudf/tests/test_dataframe.py      |  6 +--
 python/cudf/cudf/tests/test_dataframe_copy.py |  2 +-
 python/cudf/cudf/tests/test_datasets.py       |  2 +-
 python/cudf/cudf/tests/test_datetime.py       |  2 +-
 python/cudf/cudf/tests/test_dlpack.py         |  2 +-
 python/cudf/cudf/tests/test_dropna.py         |  2 +-
 python/cudf/cudf/tests/test_dtypes.py         |  2 +-
 python/cudf/cudf/tests/test_duplicates.py     |  2 +-
 python/cudf/cudf/tests/test_factorize.py      |  2 +-
 python/cudf/cudf/tests/test_feather.py        |  2 +-
 python/cudf/cudf/tests/test_fill.py           |  2 +-
 python/cudf/cudf/tests/test_gcs.py            |  2 +-
 .../cudf/cudf/tests/test_gpu_arrow_parser.py  |  2 +-
 python/cudf/cudf/tests/test_groupby.py        |  2 +-
 python/cudf/cudf/tests/test_hdf.py            |  2 +-
 python/cudf/cudf/tests/test_hdfs.py           |  2 +-
 python/cudf/cudf/tests/test_index.py          |  2 +-
 python/cudf/cudf/tests/test_indexing.py       |  2 +-
 python/cudf/cudf/tests/test_interval.py       |  2 +-
 python/cudf/cudf/tests/test_joining.py        |  2 +-
 python/cudf/cudf/tests/test_json.py           |  2 +-
 python/cudf/cudf/tests/test_list.py           |  2 +-
 python/cudf/cudf/tests/test_monotonic.py      |  2 +-
 python/cudf/cudf/tests/test_multiindex.py     |  2 +-
 python/cudf/cudf/tests/test_numerical.py      |  2 +-
 python/cudf/cudf/tests/test_numpy_interop.py  |  2 +-
 python/cudf/cudf/tests/test_ops.py            |  2 +-
 python/cudf/cudf/tests/test_orc.py            |  2 +-
 python/cudf/cudf/tests/test_pandas_interop.py |  2 +-
 python/cudf/cudf/tests/test_parquet.py        |  2 +-
 python/cudf/cudf/tests/test_pickling.py       |  2 +-
 python/cudf/cudf/tests/test_quantiles.py      |  2 +-
 python/cudf/cudf/tests/test_query.py          |  2 +-
 python/cudf/cudf/tests/test_query_mask.py     |  2 +-
 python/cudf/cudf/tests/test_rank.py           |  2 +-
 python/cudf/cudf/tests/test_reductions.py     |  2 +-
 python/cudf/cudf/tests/test_replace.py        |  2 +-
 python/cudf/cudf/tests/test_reshape.py        |  2 +-
 python/cudf/cudf/tests/test_rolling.py        |  2 +-
 python/cudf/cudf/tests/test_s3.py             |  2 +-
 python/cudf/cudf/tests/test_scalar.py         |  2 +-
 python/cudf/cudf/tests/test_scan.py           |  2 +-
 python/cudf/cudf/tests/test_search.py         |  2 +-
 python/cudf/cudf/tests/test_serialize.py      |  2 +-
 python/cudf/cudf/tests/test_series.py         |  2 +-
 python/cudf/cudf/tests/test_seriesmap.py      |  2 +-
 python/cudf/cudf/tests/test_setitem.py        |  2 +-
 python/cudf/cudf/tests/test_sorting.py        |  2 +-
 python/cudf/cudf/tests/test_sparse_df.py      |  2 +-
 python/cudf/cudf/tests/test_stats.py          |  2 +-
 python/cudf/cudf/tests/test_string.py         |  2 +-
 python/cudf/cudf/tests/test_struct.py         |  2 +-
 python/cudf/cudf/tests/test_testing.py        |  2 +-
 python/cudf/cudf/tests/test_text.py           |  2 +-
 python/cudf/cudf/tests/test_timedelta.py      |  2 +-
 python/cudf/cudf/tests/test_transform.py      |  2 +-
 .../custreamz/custreamz/tests/test_kafka.py   |  2 +-
 .../dask_cudf/tests/test_accessor.py          |  2 +-
 python/dask_cudf/dask_cudf/tests/test_core.py |  2 +-
 .../dask_cudf/tests/test_distributed.py       |  2 +-
 87 files changed, 142 insertions(+), 117 deletions(-)
 rename python/cudf/cudf/testing/{utils.py => _utils.py} (100%)

diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py
index ff2ccd19696..9b6abeb1276 100644
--- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py
+++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py
@@ -13,7 +13,7 @@
     compare_content,
     run_test,
 )
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 
 @pythonfuzz(data_handle=CSVReader)
diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py
index 6c9d564d088..2f5e6204f7c 100644
--- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py
+++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py
@@ -9,7 +9,7 @@
 from cudf._fuzz_testing.json import JSONReader, JSONWriter
 from cudf._fuzz_testing.main import pythonfuzz
 from cudf._fuzz_testing.utils import ALL_POSSIBLE_VALUES, run_test
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 
 @pythonfuzz(data_handle=JSONReader)
diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py
index 6869e40492c..f1b95173c3d 100644
--- a/python/cudf/cudf/_fuzz_testing/utils.py
+++ b/python/cudf/cudf/_fuzz_testing/utils.py
@@ -9,7 +9,7 @@
 import pyorc
 
 import cudf
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 from cudf.utils.dtypes import (
     pandas_dtypes_to_cudf_dtypes,
     pyarrow_dtypes_to_pandas_dtypes,
diff --git a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx
index fd02a08c49e..d2b053789cd 100644
--- a/python/cudf/cudf/_lib/transpose.pyx
+++ b/python/cudf/cudf/_lib/transpose.pyx
@@ -36,11 +36,11 @@ def transpose(Table source):
     if is_categorical_dtype(dtype):
         if any(not is_categorical_dtype(c.dtype) for c in source._columns):
             raise ValueError('Columns must all have the same dtype')
-        cats = list(c.cat.categories for c in source._columns)
+        cats = list(c.cat().categories for c in source._columns)
         cats = cudf.Series(cudf.concat(cats)).drop_duplicates()._column
         source = Table(index=source._index, data=[
-            (name, col.cat._set_categories(
-                col.cat.categories, cats, is_unique=True).codes)
+            (name, col.cat()._set_categories(
+                col.cat().categories, cats, is_unique=True).codes)
             for name, col in source._data.items()
         ])
     elif dtype.kind in 'OU':
diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
index 26101c8bf7f..58e6f5f454f 100644
--- a/python/cudf/cudf/_lib/utils.pyx
+++ b/python/cudf/cudf/_lib/utils.pyx
@@ -116,7 +116,13 @@ cpdef generate_pandas_metadata(Table table, index):
             index_descriptors.append(descr)
 
     metadata = pa.pandas_compat.construct_metadata(
-        columns_to_convert=table._data.columns,
+        columns_to_convert=[
+            col.to_pandas()
+            if isinstance(col, cudf.core.column.CategoricalColumn)
+            else
+            col
+            for col in table._data.columns
+        ],
         df=table,
         column_names=col_names,
         index_levels=index_levels,
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 7f8f1e7228f..e0e56edb2f1 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -936,9 +936,9 @@ def ordered(self) -> Optional[bool]:
     def ordered(self, value: bool):
         self.dtype.ordered = value
 
-    @property
-    def cat(self):
-        return CategoricalAccessor(self, parent=None)
+    # @property
+    def cat(self, parent: ParentType = None):
+        return CategoricalAccessor(self, parent=parent)
 
     def unary_operator(self, unaryop: str):
         raise TypeError(
@@ -1086,7 +1086,7 @@ def to_pandas(
             col = self
 
         signed_dtype = min_signed_type(len(col.categories))
-        codes = col.cat.codes.astype(signed_dtype).fillna(-1).to_array()
+        codes = col.cat().codes.astype(signed_dtype).fillna(-1).to_array()
         categories = col.categories.dropna(drop_nan=True).to_pandas()
         data = pd.Categorical.from_codes(
             codes, categories=categories, ordered=col.ordered
@@ -1199,11 +1199,13 @@ def find_and_replace(
         # named 'index', which came from the filtered categories,
         # contains the new ints that we need to map to
         to_replace_col = column.as_column(catmap.index).astype(
-            self.cat.codes.dtype
+            self.cat().codes.dtype
+        )
+        replacement_col = catmap["index"]._column.astype(
+            self.cat().codes.dtype
         )
-        replacement_col = catmap["index"]._column.astype(self.cat.codes.dtype)
 
-        replaced = column.as_column(self.cat.codes)
+        replaced = column.as_column(self.cat().codes)
         output = libcudf.replace.replace(
             replaced, to_replace_col, replacement_col
         )
@@ -1281,8 +1283,10 @@ def fillna(
                         )
                 # TODO: only required if fill_value has a subset of the
                 # categories:
-                fill_value = fill_value.cat._set_categories(
-                    fill_value.cat.categories, self.categories, is_unique=True,
+                fill_value = fill_value.cat()._set_categories(
+                    fill_value.cat().categories,
+                    self.categories,
+                    is_unique=True,
                 )
                 fill_value = column.as_column(fill_value.codes).astype(
                     self.codes.dtype
@@ -1360,7 +1364,7 @@ def as_categorical_column(
             # return a column full of Nulls.
             return _create_empty_categorical_column(self, dtype)
 
-        return self.cat.set_categories(
+        return self.cat().set_categories(
             new_categories=dtype.categories, ordered=dtype.ordered
         )
 
@@ -1385,8 +1389,8 @@ def as_timedelta_column(self, dtype, **kwargs) -> TimeDeltaColumn:
     def _get_decategorized_column(self) -> ColumnBase:
         if self.null_count == len(self):
             # self.categories is empty; just return codes
-            return self.cat.codes._column
-        gather_map = self.cat.codes.astype("int32").fillna(0)._column
+            return self.cat().codes._column
+        gather_map = self.cat().codes.astype("int32").fillna(0)._column
         out = self.categories.take(gather_map)
         out = out.set_mask(self.mask)
         return out
@@ -1419,7 +1423,9 @@ def copy(self, deep: bool = True) -> CategoricalColumn:
             )
 
     def __sizeof__(self) -> int:
-        return self.cat.categories.__sizeof__() + self.cat.codes.__sizeof__()
+        return (
+            self.cat().categories.__sizeof__() + self.cat().codes.__sizeof__()
+        )
 
     def _memory_usage(self, **kwargs) -> int:
         deep = kwargs.get("deep", False)
@@ -1427,7 +1433,8 @@ def _memory_usage(self, **kwargs) -> int:
             return self.__sizeof__()
         else:
             return (
-                self.categories._memory_usage() + self.cat.codes.memory_usage()
+                self.categories._memory_usage()
+                + self.cat().codes.memory_usage()
             )
 
     def _mimic_inplace(
@@ -1453,7 +1460,7 @@ def _create_empty_categorical_column(
             cudf.utils.utils.scalar_broadcast_to(
                 categorical_column.default_na_value(),
                 categorical_column.size,
-                np.dtype(categorical_column.cat.codes),
+                np.dtype(categorical_column.cat().codes),
             )
         ),
         offset=categorical_column.offset,
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index eec28c23bea..8e691773e3a 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -189,8 +189,10 @@ def __sizeof__(self) -> int:
             n += bitmask_allocation_size_bytes(self.size)
         return n
 
-    @property
-    def cat(self) -> "cudf.core.column.categorical.CategoricalAccessor":
+    # @property
+    def cat(
+        self, parent=None
+    ) -> "cudf.core.column.categorical.CategoricalAccessor":
         raise NotImplementedError()
 
     def str(self, parent=None) -> "cudf.core.column.string.StringMethods":
@@ -253,19 +255,21 @@ def _concat(
         if is_categorical:
             # Combine and de-dupe the categories
             cats = (
-                cudf.concat([o.cat.categories for o in objs])
+                cudf.concat([o.cat().categories for o in objs])
                 .to_series()
                 .drop_duplicates(ignore_index=True)
                 ._column
             )
             objs = [
-                o.cat._set_categories(o.cat.categories, cats, is_unique=True)
+                o.cat()._set_categories(
+                    o.cat().categories, cats, is_unique=True
+                )
                 for o in objs
             ]
             # Map `objs` into a list of the codes until we port Categorical to
             # use the libcudf++ Category data type.
-            objs = [o.cat.codes._column for o in objs]
-            head = head.cat.codes._column
+            objs = [o.cat().codes._column for o in objs]
+            head = head.cat().codes._column
 
         newsize = sum(map(len, objs))
         if newsize > libcudf.MAX_COLUMN_SIZE:
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index c0e1b8995aa..955b9bfbaa6 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -5396,7 +5396,12 @@ def to_arrow(self, preserve_index=True):
 
         out = super(DataFrame, data).to_arrow()
         metadata = pa.pandas_compat.construct_metadata(
-            columns_to_convert=self._data.columns,
+            columns_to_convert=[
+                col.to_pandas()
+                if isinstance(col, cudf.core.column.CategoricalColumn)
+                else col
+                for col in self._data.columns
+            ],
             df=self,
             column_names=out.schema.names,
             index_levels=[self.index],
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index f413537bbeb..926aad368b0 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -3792,7 +3792,7 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes):
         ):
             # Combine and de-dupe the categories
             categories[idx] = (
-                cudf.concat([col.cat.categories for col in cols])
+                cudf.concat([col.cat().categories for col in cols])
                 .to_series()
                 .drop_duplicates(ignore_index=True)
                 ._column
@@ -3823,8 +3823,9 @@ def _cast_cols_to_common_dtypes(col_idxs, list_of_columns, dtypes, categories):
                 if idx in categories:
                     cols[idx] = (
                         cols[idx]
-                        .cat._set_categories(
-                            cols[idx].cat.categories,
+                        .cat()
+                        ._set_categories(
+                            cols[idx].cat().categories,
                             categories[idx],
                             is_unique=True,
                         )
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index b605d51af90..8c86352b2a7 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -2672,15 +2672,17 @@ def __new__(
             dtype = None
 
         if categories is not None:
-            data.cat.set_categories(categories, ordered=ordered, inplace=True)
+            data.cat().set_categories(
+                categories, ordered=ordered, inplace=True
+            )
         elif isinstance(dtype, (pd.CategoricalDtype, cudf.CategoricalDtype)):
-            data.cat.set_categories(
+            data.cat().set_categories(
                 dtype.categories, ordered=ordered, inplace=True
             )
         elif ordered is True and data.ordered is False:
-            data.cat.as_ordered(inplace=True)
+            data.cat().as_ordered(inplace=True)
         elif ordered is False and data.ordered is True:
-            data.cat.as_unordered(inplace=True)
+            data.cat().as_unordered(inplace=True)
 
         out._initialize(data, **kwargs)
 
@@ -2691,14 +2693,14 @@ def codes(self):
         """
         The category codes of this categorical.
         """
-        return self._values.cat.codes
+        return self._values.cat().codes
 
     @property
     def categories(self):
         """
         The categories of this categorical.
         """
-        return self._values.cat.categories
+        return self._values.cat().categories
 
 
 class StringIndex(GenericIndex):
diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py
index f9b60cfce0b..e2c7ca7dca1 100644
--- a/python/cudf/cudf/io/csv.py
+++ b/python/cudf/cudf/io/csv.py
@@ -177,7 +177,7 @@ def to_csv(
         df = df.copy(deep=False)
         for col_name, col in df._data.items():
             if isinstance(col, cudf.core.column.CategoricalColumn):
-                df._data[col_name] = col.astype(col.cat.categories.dtype)
+                df._data[col_name] = col.astype(col.cat().categories.dtype)
 
         if isinstance(df.index, cudf.CategoricalIndex):
             df.index = df.index.astype(df.index.categories.dtype)
diff --git a/python/cudf/cudf/testing/utils.py b/python/cudf/cudf/testing/_utils.py
similarity index 100%
rename from python/cudf/cudf/testing/utils.py
rename to python/cudf/cudf/testing/_utils.py
diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py
index 9930327d89d..bacab24a6f3 100644
--- a/python/cudf/cudf/testing/testing.py
+++ b/python/cudf/cudf/testing/testing.py
@@ -162,8 +162,8 @@ def assert_column_equal(
 
     if check_exact and check_categorical:
         if is_categorical_dtype(left) and is_categorical_dtype(right):
-            left_cat = left.cat.categories
-            right_cat = right.cat.categories
+            left_cat = left.cat().categories
+            right_cat = right.cat().categories
 
             if check_category_order:
                 assert_index_equal(
diff --git a/python/cudf/cudf/tests/test_apply_rows.py b/python/cudf/cudf/tests/test_apply_rows.py
index 721e13b670f..f025549971f 100644
--- a/python/cudf/cudf/tests/test_apply_rows.py
+++ b/python/cudf/cudf/tests/test_apply_rows.py
@@ -2,7 +2,7 @@
 
 import cudf
 from cudf.core.column import column
-from cudf.testing.utils import assert_eq, gen_rand_series
+from cudf.testing._utils import assert_eq, gen_rand_series
 
 
 def _kernel_multiply(a, b, out):
diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py
index 71804cb717e..cd4dd28f179 100644
--- a/python/cudf/cudf/tests/test_array_function.py
+++ b/python/cudf/cudf/tests/test_array_function.py
@@ -4,7 +4,7 @@
 import pytest
 
 import cudf
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 from cudf.utils.utils import IS_NEP18_ACTIVE
 
 missing_arrfunc_cond = not IS_NEP18_ACTIVE
diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py
index c888d32276b..8cfcf4d2b6d 100644
--- a/python/cudf/cudf/tests/test_array_ufunc.py
+++ b/python/cudf/cudf/tests/test_array_ufunc.py
@@ -4,7 +4,7 @@
 import pytest
 
 import cudf
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 
 @pytest.fixture
diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
index 050f98d5ed5..48e3b0ec42c 100644
--- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
+++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
@@ -18,7 +18,7 @@
 import pytest
 
 import cudf
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 
 def cudf_from_avro_util(schema, records):
diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
index 042bdea81f5..eaafcc468b2 100644
--- a/python/cudf/cudf/tests/test_categorical.py
+++ b/python/cudf/cudf/tests/test_categorical.py
@@ -9,7 +9,7 @@
 
 import cudf
 from cudf.core._compat import PANDAS_GE_110
-from cudf.testing.utils import assert_eq, assert_exceptions_equal
+from cudf.testing._utils import assert_eq, assert_exceptions_equal
 
 
 @pytest.fixture
diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py
index 10192ea85ba..e1410423387 100644
--- a/python/cudf/cudf/tests/test_column.py
+++ b/python/cudf/cudf/tests/test_column.py
@@ -8,7 +8,7 @@
 import cudf
 from cudf._lib.transform import mask_to_bools
 from cudf.core.column.column import as_column
-from cudf.testing.utils import assert_eq, assert_exceptions_equal
+from cudf.testing._utils import assert_eq, assert_exceptions_equal
 from cudf.utils import dtypes as dtypeutils
 
 dtypes = sorted(
diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py
index 7342c04d0db..99d4bdd9910 100644
--- a/python/cudf/cudf/tests/test_column_accessor.py
+++ b/python/cudf/cudf/tests/test_column_accessor.py
@@ -6,7 +6,7 @@
 
 import cudf
 from cudf.core.column_accessor import ColumnAccessor
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 simple_test_data = [
     {},
diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py
index beb505b34d0..88cd1cadeb8 100644
--- a/python/cudf/cudf/tests/test_concat.py
+++ b/python/cudf/cudf/tests/test_concat.py
@@ -7,7 +7,7 @@
 import pytest
 
 import cudf as gd
-from cudf.testing.utils import assert_eq, assert_exceptions_equal
+from cudf.testing._utils import assert_eq, assert_exceptions_equal
 from cudf.utils.dtypes import is_categorical_dtype
 
 
diff --git a/python/cudf/cudf/tests/test_contains.py b/python/cudf/cudf/tests/test_contains.py
index ee5e87c2b2c..1d1deca5bd6 100644
--- a/python/cudf/cudf/tests/test_contains.py
+++ b/python/cudf/cudf/tests/test_contains.py
@@ -5,7 +5,7 @@
 
 from cudf import Series
 from cudf.core.index import RangeIndex, as_index
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 
 def cudf_date_series(start, stop, freq):
diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py
index 249cdd3c310..0965b5298a4 100644
--- a/python/cudf/cudf/tests/test_copying.py
+++ b/python/cudf/cudf/tests/test_copying.py
@@ -6,7 +6,7 @@
 
 import cudf
 from cudf.core import Series
-from cudf.testing.utils import NUMERIC_TYPES, OTHER_TYPES, assert_eq
+from cudf.testing._utils import NUMERIC_TYPES, OTHER_TYPES, assert_eq
 
 
 @pytest.mark.parametrize("dtype", NUMERIC_TYPES + OTHER_TYPES)
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 3633873c73d..e01a8387abd 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -15,7 +15,7 @@
 
 import cudf
 from cudf import read_csv
-from cudf.testing.utils import assert_eq, assert_exceptions_equal
+from cudf.testing._utils import assert_eq, assert_exceptions_equal
 
 
 def make_numeric_dataframe(nrows, dtype):
diff --git a/python/cudf/cudf/tests/test_cuda_apply.py b/python/cudf/cudf/tests/test_cuda_apply.py
index c608e67ac3e..2604030097b 100644
--- a/python/cudf/cudf/tests/test_cuda_apply.py
+++ b/python/cudf/cudf/tests/test_cuda_apply.py
@@ -9,7 +9,7 @@
 from numba import cuda
 
 from cudf import DataFrame
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 
 @pytest.mark.parametrize("nelem", [1, 2, 64, 128, 129])
diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py
index 47ce8ac1132..041dc0076f8 100644
--- a/python/cudf/cudf/tests/test_cuda_array_interface.py
+++ b/python/cudf/cudf/tests/test_cuda_array_interface.py
@@ -10,7 +10,7 @@
 from numba import cuda
 
 import cudf
-from cudf.testing.utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq
+from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq
 
 
 @pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES)
diff --git a/python/cudf/cudf/tests/test_custom_accessor.py b/python/cudf/cudf/tests/test_custom_accessor.py
index b032a8d0eda..16e5b345ce2 100644
--- a/python/cudf/cudf/tests/test_custom_accessor.py
+++ b/python/cudf/cudf/tests/test_custom_accessor.py
@@ -4,7 +4,7 @@
 import pytest
 
 import cudf as gd
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 
 @gd.api.extensions.register_dataframe_accessor("point")
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 60bd6a0b801..5073dd5b1e2 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -21,7 +21,7 @@
 from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_120
 from cudf.core.column import column
 from cudf.testing import utils
-from cudf.testing.utils import (
+from cudf.testing._utils import (
     ALL_TYPES,
     DATETIME_TYPES,
     NUMERIC_TYPES,
@@ -5149,8 +5149,8 @@ def test_memory_usage_cat():
     gdf = cudf.from_pandas(df)
 
     expected = (
-        gdf.B._column.cat.categories.__sizeof__()
-        + gdf.B._column.cat.codes.__sizeof__()
+        gdf.B._column.cat().categories.__sizeof__()
+        + gdf.B._column.cat().codes.__sizeof__()
     )
 
     # Check cat column
diff --git a/python/cudf/cudf/tests/test_dataframe_copy.py b/python/cudf/cudf/tests/test_dataframe_copy.py
index 8c9a1b42ae7..5b258c760b3 100644
--- a/python/cudf/cudf/tests/test_dataframe_copy.py
+++ b/python/cudf/cudf/tests/test_dataframe_copy.py
@@ -7,7 +7,7 @@
 from numba import cuda
 
 from cudf.core.dataframe import DataFrame
-from cudf.testing.utils import ALL_TYPES, assert_eq
+from cudf.testing._utils import ALL_TYPES, assert_eq
 
 """
 DataFrame copy expectations
diff --git a/python/cudf/cudf/tests/test_datasets.py b/python/cudf/cudf/tests/test_datasets.py
index ccb66fc7306..b7bc89f008d 100644
--- a/python/cudf/cudf/tests/test_datasets.py
+++ b/python/cudf/cudf/tests/test_datasets.py
@@ -1,7 +1,7 @@
 import numpy as np
 
 import cudf as gd
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 
 def test_dataset_timeseries():
diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
index d572dbd4a36..8a65ed836f2 100644
--- a/python/cudf/cudf/tests/test_datetime.py
+++ b/python/cudf/cudf/tests/test_datetime.py
@@ -14,7 +14,7 @@
 import cudf
 from cudf.core import DataFrame, Series
 from cudf.core.index import DatetimeIndex
-from cudf.testing.utils import (
+from cudf.testing._utils import (
     DATETIME_TYPES,
     NUMERIC_TYPES,
     assert_eq,
diff --git a/python/cudf/cudf/tests/test_dlpack.py b/python/cudf/cudf/tests/test_dlpack.py
index d72b6a49f72..4b2fca0d12d 100644
--- a/python/cudf/cudf/tests/test_dlpack.py
+++ b/python/cudf/cudf/tests/test_dlpack.py
@@ -8,7 +8,7 @@
 import pytest
 
 import cudf
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 nelems = [0, 3, 10]
 dtype = [np.uint16, np.int32, np.float64]
diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py
index 684eed62168..e1d0c38c760 100644
--- a/python/cudf/cudf/tests/test_dropna.py
+++ b/python/cudf/cudf/tests/test_dropna.py
@@ -5,7 +5,7 @@
 import pytest
 
 import cudf
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py
index 0f1a2b7fe59..0e547d97a32 100644
--- a/python/cudf/cudf/tests/test_dtypes.py
+++ b/python/cudf/cudf/tests/test_dtypes.py
@@ -13,7 +13,7 @@
     ListDtype,
     StructDtype,
 )
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 
 def test_cdt_basic():
diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py
index 457adbd4836..f464ac1a6c2 100644
--- a/python/cudf/cudf/tests/test_duplicates.py
+++ b/python/cudf/cudf/tests/test_duplicates.py
@@ -9,7 +9,7 @@
 
 import cudf
 from cudf import concat
-from cudf.testing.utils import assert_eq, assert_exceptions_equal
+from cudf.testing._utils import assert_eq, assert_exceptions_equal
 
 # TODO: PANDAS 1.0 support
 # Revisit drop_duplicates() tests to update parameters like ignore_index.
diff --git a/python/cudf/cudf/tests/test_factorize.py b/python/cudf/cudf/tests/test_factorize.py
index 60ba46277f4..0010079ac79 100644
--- a/python/cudf/cudf/tests/test_factorize.py
+++ b/python/cudf/cudf/tests/test_factorize.py
@@ -7,7 +7,7 @@
 
 import cudf
 from cudf.core import DataFrame, Index
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 
 @pytest.mark.parametrize("ncats,nelem", [(2, 2), (2, 10), (10, 100)])
diff --git a/python/cudf/cudf/tests/test_feather.py b/python/cudf/cudf/tests/test_feather.py
index 61362edb8b9..6c83ee3c458 100644
--- a/python/cudf/cudf/tests/test_feather.py
+++ b/python/cudf/cudf/tests/test_feather.py
@@ -10,7 +10,7 @@
 import pytest
 
 import cudf
-from cudf.testing.utils import NUMERIC_TYPES, assert_eq
+from cudf.testing._utils import NUMERIC_TYPES, assert_eq
 
 if LooseVersion(pd.__version__) < LooseVersion("0.24"):
     try:
diff --git a/python/cudf/cudf/tests/test_fill.py b/python/cudf/cudf/tests/test_fill.py
index e6904328065..efbe2834486 100644
--- a/python/cudf/cudf/tests/test_fill.py
+++ b/python/cudf/cudf/tests/test_fill.py
@@ -2,7 +2,7 @@
 import pytest
 
 import cudf
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_gcs.py b/python/cudf/cudf/tests/test_gcs.py
index 181b31f5327..99d79e41520 100644
--- a/python/cudf/cudf/tests/test_gcs.py
+++ b/python/cudf/cudf/tests/test_gcs.py
@@ -10,7 +10,7 @@
 import pytest
 
 import cudf
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 gcsfs = pytest.importorskip("gcsfs")
 
diff --git a/python/cudf/cudf/tests/test_gpu_arrow_parser.py b/python/cudf/cudf/tests/test_gpu_arrow_parser.py
index 67c83b9a917..baf2fa62e38 100644
--- a/python/cudf/cudf/tests/test_gpu_arrow_parser.py
+++ b/python/cudf/cudf/tests/test_gpu_arrow_parser.py
@@ -9,7 +9,7 @@
 
 import cudf
 from cudf.comm.gpuarrow import GpuArrowReader
-from cudf.testing.utils import INTEGER_TYPES
+from cudf.testing._utils import INTEGER_TYPES
 
 
 def make_gpu_parse_arrow_data_batch():
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index e6d91b87034..e5309e3b8b9 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -12,7 +12,7 @@
 import cudf
 from cudf.core import DataFrame, Series
 from cudf.core._compat import PANDAS_GE_110
-from cudf.testing.utils import assert_eq, assert_exceptions_equal
+from cudf.testing._utils import assert_eq, assert_exceptions_equal
 
 _now = np.datetime64("now")
 _tomorrow = _now + np.timedelta64(1, "D")
diff --git a/python/cudf/cudf/tests/test_hdf.py b/python/cudf/cudf/tests/test_hdf.py
index f1d573a5ca2..1bf91a52c2f 100644
--- a/python/cudf/cudf/tests/test_hdf.py
+++ b/python/cudf/cudf/tests/test_hdf.py
@@ -8,7 +8,7 @@
 import pytest
 
 import cudf
-from cudf.testing.utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq
+from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq
 
 try:
     import tables  # noqa F401
diff --git a/python/cudf/cudf/tests/test_hdfs.py b/python/cudf/cudf/tests/test_hdfs.py
index b26315b02fd..24554f113bb 100644
--- a/python/cudf/cudf/tests/test_hdfs.py
+++ b/python/cudf/cudf/tests/test_hdfs.py
@@ -11,7 +11,7 @@
 from pyarrow import orc as orc
 
 import cudf
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 if not os.environ.get("RUN_HDFS_TESTS"):
     pytestmark = pytest.mark.skip("Env not configured to run HDFS tests")
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index c705ef98138..2bd3d4d09ce 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -20,7 +20,7 @@
     RangeIndex,
     as_index,
 )
-from cudf.testing.utils import (
+from cudf.testing._utils import (
     FLOAT_TYPES,
     NUMERIC_TYPES,
     OTHER_TYPES,
diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py
index 25a0694a4e5..3d6063d3419 100644
--- a/python/cudf/cudf/tests/test_indexing.py
+++ b/python/cudf/cudf/tests/test_indexing.py
@@ -10,7 +10,7 @@
 import cudf
 from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_120
 from cudf.testing import utils
-from cudf.testing.utils import (
+from cudf.testing._utils import (
     INTEGER_TYPES,
     assert_eq,
     assert_exceptions_equal,
diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py
index 680ce6ee597..fc193441113 100644
--- a/python/cudf/cudf/tests/test_interval.py
+++ b/python/cudf/cudf/tests/test_interval.py
@@ -4,7 +4,7 @@
 import pytest
 
 import cudf
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py
index 4461a38fcf9..9babe519817 100644
--- a/python/cudf/cudf/tests/test_joining.py
+++ b/python/cudf/cudf/tests/test_joining.py
@@ -7,7 +7,7 @@
 import cudf
 from cudf.core._compat import PANDAS_GE_120
 from cudf.core.dtypes import CategoricalDtype
-from cudf.testing.utils import (
+from cudf.testing._utils import (
     INTEGER_TYPES,
     NUMERIC_TYPES,
     assert_eq,
diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py
index ea85075d766..09ecb8a1efe 100644
--- a/python/cudf/cudf/tests/test_json.py
+++ b/python/cudf/cudf/tests/test_json.py
@@ -12,7 +12,7 @@
 
 import cudf
 from cudf.core._compat import PANDAS_GE_110
-from cudf.testing.utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq
+from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq
 
 
 def make_numeric_dataframe(nrows, dtype):
diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index 07ddf0028f2..d2e1f46416c 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -5,7 +5,7 @@
 import pytest
 
 import cudf
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py
index 86304fd3057..e9c828ec0f5 100644
--- a/python/cudf/cudf/tests/test_monotonic.py
+++ b/python/cudf/cudf/tests/test_monotonic.py
@@ -16,7 +16,7 @@
     RangeIndex,
     StringIndex,
 )
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 
 @pytest.mark.parametrize("testrange", [(10, 20, 1), (0, -10, -1), (5, 5, 1)])
diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py
index 182fd1dc6ea..c8e5a9f071b 100644
--- a/python/cudf/cudf/tests/test_multiindex.py
+++ b/python/cudf/cudf/tests/test_multiindex.py
@@ -15,7 +15,7 @@
 import cudf
 from cudf.core.column import as_column
 from cudf.core.index import as_index
-from cudf.testing.utils import assert_eq, assert_exceptions_equal, assert_neq
+from cudf.testing._utils import assert_eq, assert_exceptions_equal, assert_neq
 
 
 def test_multiindex_levels_codes_validation():
diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py
index ed23741b39c..3e014c98ea7 100644
--- a/python/cudf/cudf/tests/test_numerical.py
+++ b/python/cudf/cudf/tests/test_numerical.py
@@ -6,7 +6,7 @@
 
 import cudf
 from cudf.core._compat import PANDAS_GE_100
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 
 def test_can_cast_safely_same_kind():
diff --git a/python/cudf/cudf/tests/test_numpy_interop.py b/python/cudf/cudf/tests/test_numpy_interop.py
index 951af77b155..e5efe2f027d 100644
--- a/python/cudf/cudf/tests/test_numpy_interop.py
+++ b/python/cudf/cudf/tests/test_numpy_interop.py
@@ -2,7 +2,7 @@
 import pytest
 
 from cudf.core import DataFrame, Series
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 
 def test_to_records_noindex():
diff --git a/python/cudf/cudf/tests/test_ops.py b/python/cudf/cudf/tests/test_ops.py
index b7228739cfa..ac3f784ecd4 100644
--- a/python/cudf/cudf/tests/test_ops.py
+++ b/python/cudf/cudf/tests/test_ops.py
@@ -5,7 +5,7 @@
 import pytest
 
 import cudf
-from cudf.testing.utils import assert_eq, gen_rand
+from cudf.testing._utils import assert_eq, gen_rand
 
 
 def test_sqrt_float():
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index ff3c81a6cc1..8b323f269ff 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -13,7 +13,7 @@
 
 import cudf
 from cudf.io.orc import ORCWriter
-from cudf.testing.utils import (
+from cudf.testing._utils import (
     assert_eq,
     gen_rand_series,
     supported_numpy_dtypes,
diff --git a/python/cudf/cudf/tests/test_pandas_interop.py b/python/cudf/cudf/tests/test_pandas_interop.py
index ac1b39c9219..a8a45fc3c28 100644
--- a/python/cudf/cudf/tests/test_pandas_interop.py
+++ b/python/cudf/cudf/tests/test_pandas_interop.py
@@ -5,7 +5,7 @@
 
 import cudf
 from cudf.core import DataFrame
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 
 def test_to_pandas():
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 526e5adfba9..26c230cab5f 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -18,7 +18,7 @@
 import cudf
 from cudf.io.parquet import ParquetWriter, merge_parquet_filemetadata
 from cudf.testing import dataset_generator as dg
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 
 @pytest.fixture(scope="module")
diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py
index 6ca55e625bf..596af1d2686 100644
--- a/python/cudf/cudf/tests/test_pickling.py
+++ b/python/cudf/cudf/tests/test_pickling.py
@@ -8,7 +8,7 @@
 
 from cudf.core import DataFrame, GenericIndex, Series
 from cudf.core.buffer import Buffer
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 if sys.version_info < (3, 8):
     try:
diff --git a/python/cudf/cudf/tests/test_quantiles.py b/python/cudf/cudf/tests/test_quantiles.py
index f72a8a5fc71..4055485c49a 100644
--- a/python/cudf/cudf/tests/test_quantiles.py
+++ b/python/cudf/cudf/tests/test_quantiles.py
@@ -1,7 +1,7 @@
 import pandas as pd
 
 import cudf
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 
 def test_single_q():
diff --git a/python/cudf/cudf/tests/test_query.py b/python/cudf/cudf/tests/test_query.py
index 6e81f6f8457..8dc5df2dd7c 100644
--- a/python/cudf/cudf/tests/test_query.py
+++ b/python/cudf/cudf/tests/test_query.py
@@ -12,7 +12,7 @@
 
 import cudf
 from cudf.core import DataFrame
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 from cudf.utils import queryutils
 
 _params_query_parser = []
diff --git a/python/cudf/cudf/tests/test_query_mask.py b/python/cudf/cudf/tests/test_query_mask.py
index 08218a3bdbf..ab1c085c6c0 100644
--- a/python/cudf/cudf/tests/test_query_mask.py
+++ b/python/cudf/cudf/tests/test_query_mask.py
@@ -3,7 +3,7 @@
 import pytest
 
 import cudf
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 _data = [
     {"a": [0, 1.0, 2.0, None, np.nan, None, 3, 5]},
diff --git a/python/cudf/cudf/tests/test_rank.py b/python/cudf/cudf/tests/test_rank.py
index dfd8d4824cd..3c98496def3 100644
--- a/python/cudf/cudf/tests/test_rank.py
+++ b/python/cudf/cudf/tests/test_rank.py
@@ -7,7 +7,7 @@
 import pytest
 
 from cudf.core import DataFrame
-from cudf.testing.utils import assert_eq, assert_exceptions_equal
+from cudf.testing._utils import assert_eq, assert_exceptions_equal
 
 
 @pytest.fixture
diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py
index 53b07a253ff..433f9d2f6ac 100644
--- a/python/cudf/cudf/tests/test_reductions.py
+++ b/python/cudf/cudf/tests/test_reductions.py
@@ -12,7 +12,7 @@
 import cudf
 from cudf.core import Series
 from cudf.testing import utils
-from cudf.testing.utils import NUMERIC_TYPES, gen_rand
+from cudf.testing._utils import NUMERIC_TYPES, gen_rand
 
 params_dtype = NUMERIC_TYPES
 
diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py
index 7c7dd948e13..c2da31098d4 100644
--- a/python/cudf/cudf/tests/test_replace.py
+++ b/python/cudf/cudf/tests/test_replace.py
@@ -8,7 +8,7 @@
 
 import cudf
 from cudf.core import DataFrame, Series
-from cudf.testing.utils import (
+from cudf.testing._utils import (
     INTEGER_TYPES,
     NUMERIC_TYPES,
     assert_eq,
diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py
index d25a2dd68ac..0c4313eb47c 100644
--- a/python/cudf/cudf/tests/test_reshape.py
+++ b/python/cudf/cudf/tests/test_reshape.py
@@ -9,7 +9,7 @@
 import cudf
 from cudf import melt as cudf_melt
 from cudf.core._compat import PANDAS_GE_120
-from cudf.testing.utils import (
+from cudf.testing._utils import (
     ALL_TYPES,
     DATETIME_TYPES,
     NUMERIC_TYPES,
diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py
index 76e09eb5069..07e7f43c992 100644
--- a/python/cudf/cudf/tests/test_rolling.py
+++ b/python/cudf/cudf/tests/test_rolling.py
@@ -8,7 +8,7 @@
 
 import cudf
 from cudf.core._compat import PANDAS_GE_110
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py
index 300a4f6e917..133597b8f19 100644
--- a/python/cudf/cudf/tests/test_s3.py
+++ b/python/cudf/cudf/tests/test_s3.py
@@ -14,7 +14,7 @@
 import pytest
 
 import cudf
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 moto = pytest.importorskip("moto", minversion="1.3.14")
 boto3 = pytest.importorskip("boto3")
diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py
index 6e8830fb207..05a826415a7 100644
--- a/python/cudf/cudf/tests/test_scalar.py
+++ b/python/cudf/cudf/tests/test_scalar.py
@@ -9,7 +9,7 @@
 import cudf
 from cudf import Scalar as pycudf_scalar
 from cudf._lib.copying import get_element
-from cudf.testing.utils import (
+from cudf.testing._utils import (
     ALL_TYPES,
     DATETIME_TYPES,
     NUMERIC_TYPES,
diff --git a/python/cudf/cudf/tests/test_scan.py b/python/cudf/cudf/tests/test_scan.py
index 33e9953c9ab..0f4cea5f812 100644
--- a/python/cudf/cudf/tests/test_scan.py
+++ b/python/cudf/cudf/tests/test_scan.py
@@ -5,7 +5,7 @@
 import pytest
 
 import cudf
-from cudf.testing.utils import (
+from cudf.testing._utils import (
     INTEGER_TYPES,
     NUMERIC_TYPES,
     assert_eq,
diff --git a/python/cudf/cudf/tests/test_search.py b/python/cudf/cudf/tests/test_search.py
index 28da93d3401..c16c6486cd4 100644
--- a/python/cudf/cudf/tests/test_search.py
+++ b/python/cudf/cudf/tests/test_search.py
@@ -5,7 +5,7 @@
 import pytest
 
 import cudf
-from cudf.testing.utils import assert_eq, gen_rand, random_bitmask
+from cudf.testing._utils import assert_eq, gen_rand, random_bitmask
 
 
 @pytest.mark.parametrize("side", ["left", "right"])
diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py
index d76575eb8cc..9fba750bae2 100644
--- a/python/cudf/cudf/tests/test_serialize.py
+++ b/python/cudf/cudf/tests/test_serialize.py
@@ -9,7 +9,7 @@
 
 import cudf
 from cudf.testing import utils
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 5b9ecc98b40..8264017e905 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -9,7 +9,7 @@
 import pytest
 
 import cudf
-from cudf.testing.utils import (
+from cudf.testing._utils import (
     DATETIME_TYPES,
     NUMERIC_TYPES,
     TIMEDELTA_TYPES,
diff --git a/python/cudf/cudf/tests/test_seriesmap.py b/python/cudf/cudf/tests/test_seriesmap.py
index be08b3ba9a4..d4ef3ba235d 100644
--- a/python/cudf/cudf/tests/test_seriesmap.py
+++ b/python/cudf/cudf/tests/test_seriesmap.py
@@ -9,7 +9,7 @@
 
 import cudf
 from cudf import Series
-from cudf.testing.utils import assert_eq, assert_exceptions_equal
+from cudf.testing._utils import assert_eq, assert_exceptions_equal
 
 
 def test_series_map_basic():
diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py
index 9ea891dbda4..921a6b1556a 100644
--- a/python/cudf/cudf/tests/test_setitem.py
+++ b/python/cudf/cudf/tests/test_setitem.py
@@ -6,7 +6,7 @@
 
 import cudf
 from cudf.core._compat import PANDAS_EQ_123, PANDAS_GE_120
-from cudf.testing.utils import assert_eq, assert_exceptions_equal
+from cudf.testing._utils import assert_eq, assert_exceptions_equal
 
 
 @pytest.mark.parametrize("df", [pd.DataFrame({"a": [1, 2, 3]})])
diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py
index 5c8b278aaba..95942045654 100644
--- a/python/cudf/cudf/tests/test_sorting.py
+++ b/python/cudf/cudf/tests/test_sorting.py
@@ -9,7 +9,7 @@
 
 from cudf.core import DataFrame, Series
 from cudf.core.column import NumericalColumn
-from cudf.testing.utils import (
+from cudf.testing._utils import (
     DATETIME_TYPES,
     NUMERIC_TYPES,
     assert_eq,
diff --git a/python/cudf/cudf/tests/test_sparse_df.py b/python/cudf/cudf/tests/test_sparse_df.py
index 23d149fe78d..50c8f3f41a8 100644
--- a/python/cudf/cudf/tests/test_sparse_df.py
+++ b/python/cudf/cudf/tests/test_sparse_df.py
@@ -8,7 +8,7 @@
 
 from cudf.comm.gpuarrow import GpuArrowReader
 from cudf.core import DataFrame, Series
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 
 def read_data():
diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py
index ffa605d5782..d4e944848c9 100644
--- a/python/cudf/cudf/tests/test_stats.py
+++ b/python/cudf/cudf/tests/test_stats.py
@@ -9,7 +9,7 @@
 
 import cudf
 from cudf.datasets import randomdata
-from cudf.testing.utils import assert_eq, assert_exceptions_equal
+from cudf.testing._utils import assert_eq, assert_exceptions_equal
 
 params_dtypes = [np.int32, np.uint32, np.float32, np.float64]
 methods = ["min", "max", "sum", "mean", "var", "std"]
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index 6cc64a999aa..5fd3ffe43d5 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -17,7 +17,7 @@
 from cudf.core._compat import PANDAS_GE_110
 from cudf.core.column.string import StringColumn
 from cudf.core.index import StringIndex, as_index
-from cudf.testing.utils import (
+from cudf.testing._utils import (
     DATETIME_TYPES,
     NUMERIC_TYPES,
     assert_eq,
diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py
index abfe1f3a73c..e89db6fa138 100644
--- a/python/cudf/cudf/tests/test_struct.py
+++ b/python/cudf/cudf/tests/test_struct.py
@@ -4,7 +4,7 @@
 import pytest
 
 import cudf
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_testing.py b/python/cudf/cudf/tests/test_testing.py
index a90c991937b..b2e5ea70ddc 100644
--- a/python/cudf/cudf/tests/test_testing.py
+++ b/python/cudf/cudf/tests/test_testing.py
@@ -10,7 +10,7 @@
     assert_index_equal,
     assert_series_equal,
 )
-from cudf.testing.utils import NUMERIC_TYPES, OTHER_TYPES, assert_eq
+from cudf.testing._utils import NUMERIC_TYPES, OTHER_TYPES, assert_eq
 
 
 @pytest.mark.parametrize("rdata", [[1, 2, 5], [1, 2, 6], [1, 2, 5, 6]])
diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py
index be01e6f7c48..79e9c68716e 100644
--- a/python/cudf/cudf/tests/test_text.py
+++ b/python/cudf/cudf/tests/test_text.py
@@ -5,7 +5,7 @@
 import pytest
 
 import cudf
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 
 def test_tokenize():
diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py
index 524731295fc..07a4564d2ba 100644
--- a/python/cudf/cudf/tests/test_timedelta.py
+++ b/python/cudf/cudf/tests/test_timedelta.py
@@ -12,7 +12,7 @@
 import cudf
 from cudf.core._compat import PANDAS_GE_120
 from cudf.testing import utils as utils
-from cudf.testing.utils import assert_eq, assert_exceptions_equal
+from cudf.testing._utils import assert_eq, assert_exceptions_equal
 
 _TIMEDELTA_DATA = [
     [1000000, 200000, 3000000],
diff --git a/python/cudf/cudf/tests/test_transform.py b/python/cudf/cudf/tests/test_transform.py
index 3e77f74d350..f9fadb15304 100644
--- a/python/cudf/cudf/tests/test_transform.py
+++ b/python/cudf/cudf/tests/test_transform.py
@@ -6,7 +6,7 @@
 import pytest
 
 from cudf.core import Series
-from cudf.testing.utils import NUMERIC_TYPES
+from cudf.testing._utils import NUMERIC_TYPES
 
 supported_types = NUMERIC_TYPES
 
diff --git a/python/custreamz/custreamz/tests/test_kafka.py b/python/custreamz/custreamz/tests/test_kafka.py
index 157b021a0b7..d29ebf8db8b 100644
--- a/python/custreamz/custreamz/tests/test_kafka.py
+++ b/python/custreamz/custreamz/tests/test_kafka.py
@@ -2,7 +2,7 @@
 import confluent_kafka as ck
 import pytest
 
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 
 @pytest.mark.parametrize("commit_offset", [-1, 0, 1, 1000])
diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py
index 0ede420ed2e..bfe4ca9d2e4 100644
--- a/python/dask_cudf/dask_cudf/tests/test_accessor.py
+++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py
@@ -8,7 +8,7 @@
 import dask_cudf as dgd
 
 from cudf import DataFrame, Series
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 #############################################################################
 #                        Datetime Accessor                                  #
diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py
index 6091d0a5681..a3a15a5da38 100644
--- a/python/dask_cudf/dask_cudf/tests/test_core.py
+++ b/python/dask_cudf/dask_cudf/tests/test_core.py
@@ -706,7 +706,7 @@ def test_dataframe_set_index():
 
     pddf = dd.from_pandas(pdf, npartitions=4)
     pddf = pddf.set_index("str")
-    from cudf.testing.utils import assert_eq
+    from cudf.testing._utils import assert_eq
 
     assert_eq(ddf.compute(), pddf.compute())
 
diff --git a/python/dask_cudf/dask_cudf/tests/test_distributed.py b/python/dask_cudf/dask_cudf/tests/test_distributed.py
index 7e4adace212..f16bf8889ce 100644
--- a/python/dask_cudf/dask_cudf/tests/test_distributed.py
+++ b/python/dask_cudf/dask_cudf/tests/test_distributed.py
@@ -9,7 +9,7 @@
 import dask_cudf
 
 import cudf
-from cudf.testing.utils import assert_eq
+from cudf.testing._utils import assert_eq
 
 dask_cuda = pytest.importorskip("dask_cuda")
 

From 5ff10d599c2d00f700cf2fbf0bcc214416536942 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Thu, 4 Mar 2021 18:25:41 -0600
Subject: [PATCH 05/54] Apply suggestions from code review

---
 python/cudf/cudf/core/column/categorical.py | 1 -
 python/cudf/cudf/core/column/column.py      | 1 -
 2 files changed, 2 deletions(-)

diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index e0e56edb2f1..c41a458f02b 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -936,7 +936,6 @@ def ordered(self) -> Optional[bool]:
     def ordered(self, value: bool):
         self.dtype.ordered = value
 
-    # @property
     def cat(self, parent: ParentType = None):
         return CategoricalAccessor(self, parent=parent)
 
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 8e691773e3a..d6ea81d17f8 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -189,7 +189,6 @@ def __sizeof__(self) -> int:
             n += bitmask_allocation_size_bytes(self.size)
         return n
 
-    # @property
     def cat(
         self, parent=None
     ) -> "cudf.core.column.categorical.CategoricalAccessor":

From 5d18fb78389f1535e014daf744f18c182ad8c24a Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Thu, 11 Mar 2021 11:44:19 -0800
Subject: [PATCH 06/54] pyarrow 2.0

---
 conda/recipes/cudf/meta.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index f8ecb711d9b..e213bc06062 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -27,7 +27,7 @@ requirements:
     - setuptools
     - numba >=0.49.0
     - dlpack
-    - pyarrow 3.0.0
+    - pyarrow 2.0.0
     - libcudf {{ version }}
     - rmm {{ minor_version }}
     - cudatoolkit {{ cuda_version }}

From 202130495d29715cc8f987b23d3e51436a3ba618 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Thu, 11 Mar 2021 11:44:57 -0800
Subject: [PATCH 07/54] pyarrow 2.0

---
 conda/recipes/libcudf/meta.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 85aa0f08b48..01f429531e8 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -37,7 +37,7 @@ requirements:
   host:
     - librmm {{ minor_version }}.*
     - cudatoolkit {{ cuda_version }}.*
-    - arrow-cpp 3.0.0
+    - arrow-cpp 2.0.0
     - arrow-cpp-proc * cuda
     - boost-cpp 1.72.0
     - dlpack

From 3ff2c807799aea2abbae7092529134d407f35f24 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 16 Mar 2021 18:52:23 -0500
Subject: [PATCH 08/54] sync code-base

---
 python/cudf/cudf/tests/test_applymap.py   | 2 +-
 python/cudf/cudf/tests/test_binops.py     | 2 +-
 python/cudf/cudf/tests/test_dataframe.py  | 2 +-
 python/cudf/cudf/tests/test_indexing.py   | 2 +-
 python/cudf/cudf/tests/test_onehot.py     | 2 +-
 python/cudf/cudf/tests/test_reductions.py | 2 +-
 python/cudf/cudf/tests/test_repr.py       | 2 +-
 python/cudf/cudf/tests/test_serialize.py  | 2 +-
 python/cudf/cudf/tests/test_unaops.py     | 2 +-
 9 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/python/cudf/cudf/tests/test_applymap.py b/python/cudf/cudf/tests/test_applymap.py
index eeacf05b33b..fa3c88a3551 100644
--- a/python/cudf/cudf/tests/test_applymap.py
+++ b/python/cudf/cudf/tests/test_applymap.py
@@ -7,7 +7,7 @@
 import pytest
 
 from cudf import Series
-from cudf.testing import utils
+from cudf.testing import _utils as utils
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py
index 32f64a61894..5f8412585d1 100644
--- a/python/cudf/cudf/tests/test_binops.py
+++ b/python/cudf/cudf/tests/test_binops.py
@@ -14,7 +14,7 @@
 import cudf
 from cudf.core import Series
 from cudf.core.index import as_index
-from cudf.testing import utils
+from cudf.testing import _utils as utils
 from cudf.utils.dtypes import (
     BOOL_TYPES,
     DATETIME_TYPES,
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 07a00d510eb..a79b6c73b0f 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -20,7 +20,7 @@
 import cudf
 from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_120
 from cudf.core.column import column
-from cudf.testing import utils
+from cudf.testing import _utils as utils
 from cudf.testing._utils import (
     ALL_TYPES,
     DATETIME_TYPES,
diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py
index 3a35d459d5f..1e67b5208a2 100644
--- a/python/cudf/cudf/tests/test_indexing.py
+++ b/python/cudf/cudf/tests/test_indexing.py
@@ -9,7 +9,7 @@
 
 import cudf
 from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_120
-from cudf.testing import utils
+from cudf.testing import _utils as utils
 from cudf.testing._utils import (
     INTEGER_TYPES,
     assert_eq,
diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py
index 1cc2d90e501..286ba852356 100644
--- a/python/cudf/cudf/tests/test_onehot.py
+++ b/python/cudf/cudf/tests/test_onehot.py
@@ -8,7 +8,7 @@
 
 import cudf
 from cudf.core import DataFrame, GenericIndex, Series
-from cudf.testing import utils
+from cudf.testing import _utils as utils
 
 
 def test_onehot_simple():
diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py
index 433f9d2f6ac..0d96cbee942 100644
--- a/python/cudf/cudf/tests/test_reductions.py
+++ b/python/cudf/cudf/tests/test_reductions.py
@@ -11,7 +11,7 @@
 
 import cudf
 from cudf.core import Series
-from cudf.testing import utils
+from cudf.testing import _utils as utils
 from cudf.testing._utils import NUMERIC_TYPES, gen_rand
 
 params_dtype = NUMERIC_TYPES
diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py
index d9de9335889..30460ddee03 100644
--- a/python/cudf/cudf/tests/test_repr.py
+++ b/python/cudf/cudf/tests/test_repr.py
@@ -10,7 +10,7 @@
 
 import cudf
 from cudf.core._compat import PANDAS_GE_110
-from cudf.testing import utils
+from cudf.testing import _utils as utils
 from cudf.utils.dtypes import cudf_dtypes_to_pandas_dtypes
 
 repr_categories = utils.NUMERIC_TYPES + ["str", "category", "datetime64[ns]"]
diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py
index 9fba750bae2..49eefe19616 100644
--- a/python/cudf/cudf/tests/test_serialize.py
+++ b/python/cudf/cudf/tests/test_serialize.py
@@ -8,7 +8,7 @@
 import pytest
 
 import cudf
-from cudf.testing import utils
+from cudf.testing import _utils as utils
 from cudf.testing._utils import assert_eq
 
 
diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py
index 61e437ef9e8..2089f764724 100644
--- a/python/cudf/cudf/tests/test_unaops.py
+++ b/python/cudf/cudf/tests/test_unaops.py
@@ -10,7 +10,7 @@
 
 import cudf
 from cudf.core import Series
-from cudf.testing import utils
+from cudf.testing import _utils as utils
 
 _unaops = [operator.abs, operator.invert, operator.neg, np.ceil, np.floor]
 

From 88794fe379024489c41572cd0f04c6f5b8f137e2 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 16 Mar 2021 19:10:20 -0500
Subject: [PATCH 09/54] fix imports

---
 python/cudf/cudf/tests/test_decimal.py   | 10 +++-------
 python/cudf/cudf/tests/test_timedelta.py |  2 +-
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py
index ddf56828c3d..ed2782c8c58 100644
--- a/python/cudf/cudf/tests/test_decimal.py
+++ b/python/cudf/cudf/tests/test_decimal.py
@@ -5,15 +5,11 @@
 import numpy as np
 import pyarrow as pa
 import pytest
-import cudf
 
-from cudf.core.dtypes import Decimal64Dtype
+import cudf
 from cudf.core.column import DecimalColumn, NumericalColumn
-
-from cudf.tests.utils import (
-    FLOAT_TYPES,
-    assert_eq,
-)
+from cudf.core.dtypes import Decimal64Dtype
+from cudf.testing._utils import FLOAT_TYPES, assert_eq
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py
index 07a4564d2ba..a65fdeeb0dd 100644
--- a/python/cudf/cudf/tests/test_timedelta.py
+++ b/python/cudf/cudf/tests/test_timedelta.py
@@ -11,7 +11,7 @@
 
 import cudf
 from cudf.core._compat import PANDAS_GE_120
-from cudf.testing import utils as utils
+from cudf.testing import _utils as utils
 from cudf.testing._utils import assert_eq, assert_exceptions_equal
 
 _TIMEDELTA_DATA = [

From 90a860150f6a681ca2ddd0675922b0ff34162c8b Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 17 Mar 2021 08:26:38 -0700
Subject: [PATCH 10/54] change arrow version

---
 conda/recipes/libcudf/meta.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 709194f0530..368c55141f4 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -37,7 +37,7 @@ requirements:
   host:
     - librmm {{ minor_version }}.*
     - cudatoolkit {{ cuda_version }}.*
-    - arrow-cpp 2.0.0
+    - arrow-cpp 3.0.0
     - arrow-cpp-proc * cuda
     - boost-cpp 1.72.0
     - dlpack

From 0545fdd2eba1de9d8cf0ffdda878aaf27f13e3ef Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 6 Apr 2021 21:01:38 -0700
Subject: [PATCH 11/54] sync changes

---
 python/cudf/cudf/comm/gpuarrow.py      | 29 +++++++++-------------
 python/cudf/cudf/testing/_utils.py     | 33 ++++++++++++++++++++++++++
 python/cudf/cudf/tests/test_binops.py  |  2 +-
 python/cudf/cudf/tests/test_joining.py | 32 +------------------------
 python/cudf/cudf/tests/test_string.py  |  2 +-
 5 files changed, 47 insertions(+), 51 deletions(-)

diff --git a/python/cudf/cudf/comm/gpuarrow.py b/python/cudf/cudf/comm/gpuarrow.py
index 16ddb582605..451572224c6 100644
--- a/python/cudf/cudf/comm/gpuarrow.py
+++ b/python/cudf/cudf/comm/gpuarrow.py
@@ -1,5 +1,4 @@
-# Copyright (c) 2019-2021, NVIDIA CORPORATION.
-
+# Copyright (c) 2019-2020, NVIDIA CORPORATION.
 from collections import OrderedDict
 from collections.abc import Sequence
 
@@ -10,7 +9,7 @@
 from cudf._lib.gpuarrow import (
     CudaRecordBatchStreamReader as _CudaRecordBatchStreamReader,
 )
-from cudf.core import DataFrame, Series, column
+from cudf.core import Series, column
 from cudf.utils.utils import mask_bitsize, mask_dtype
 
 
@@ -34,25 +33,19 @@ def __init__(self, source, schema=None):
 
 class GpuArrowReader(Sequence):
     def __init__(self, schema, dev_ary):
-        table = CudaRecordBatchStreamReader(dev_ary, schema).read_all()
-        self._df = DataFrame.from_arrow(table)
-        self._schema = pa.Schema.from_pandas(self._df)
+        self._table = CudaRecordBatchStreamReader(dev_ary, schema).read_all()
 
     def __len__(self):
-        return len(self._df._data.names)
+        return self._table.num_columns
 
     def __getitem__(self, idx):
-        return GpuArrowNodeReader(
-            schema=self._schema,
-            field=self._schema[idx],
-            series=self._df._data.columns[idx],
-        )
+        return GpuArrowNodeReader(self._table, idx)
 
     def schema(self):
         """
         Return a pyarrow schema
         """
-        return self._schema
+        return self._table.schema
 
     def to_dict(self):
         """
@@ -65,10 +58,10 @@ def to_dict(self):
 
 
 class GpuArrowNodeReader(object):
-    def __init__(self, schema, field, series):
-        self._schema = schema
-        self._field = field
-        self._series = Series(column.as_column(series))
+    def __init__(self, table, index):
+        self._table = table
+        self._field = table.schema[index]
+        self._series = Series(column.as_column(table.column(index)))
         self._series.name = self.name
 
     def __len__(self):
@@ -76,7 +69,7 @@ def __len__(self):
 
     @property
     def schema(self):
-        return self._schema
+        return self._table.schema
 
     @property
     def field_schema(self):
diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py
index 37a74ab4760..055535d2215 100644
--- a/python/cudf/cudf/testing/_utils.py
+++ b/python/cudf/cudf/testing/_utils.py
@@ -15,6 +15,8 @@
 from cudf.core.column.datetime import _numpy_to_pandas_conversion
 from cudf.utils import dtypes as dtypeutils
 
+_JOIN_TYPES = ("left", "inner", "outer", "right", "leftanti", "leftsemi")
+
 supported_numpy_dtypes = [
     "bool",
     "int8",
@@ -299,3 +301,34 @@ def gen_rand_series(dtype, size, **kwargs):
 @contextmanager
 def does_not_raise():
     yield
+
+
+def assert_join_results_equal(expect, got, how, **kwargs):
+    if how not in _JOIN_TYPES:
+        raise ValueError(f"Unrecognized join type {how}")
+    if how == "right":
+        got = got[expect.columns]
+
+    if isinstance(expect, (pd.Series, cudf.Series)):
+        return assert_eq(
+            expect.sort_values().reset_index(drop=True),
+            got.sort_values().reset_index(drop=True),
+            **kwargs,
+        )
+    elif isinstance(expect, (pd.DataFrame, cudf.DataFrame)):
+        if not len(
+            expect.columns
+        ):  # can't sort_values() on a df without columns
+            return assert_eq(expect, got, **kwargs)
+
+        assert_eq(
+            expect.sort_values(expect.columns.to_list()).reset_index(
+                drop=True
+            ),
+            got.sort_values(got.columns.to_list()).reset_index(drop=True),
+            **kwargs,
+        )
+    elif isinstance(expect, (pd.Index, cudf.Index)):
+        return assert_eq(expect.sort_values(), got.sort_values(), **kwargs)
+    else:
+        raise ValueError(f"Not a join result: {type(expect).__name__}")
diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py
index 7a6fa526402..f205eae68e5 100644
--- a/python/cudf/cudf/tests/test_binops.py
+++ b/python/cudf/cudf/tests/test_binops.py
@@ -2037,7 +2037,7 @@ def test_binops_decimal(args):
         ),
     ],
 )
-@pytest.mark.parametrize("integer_dtype", cudf.tests.utils.INTEGER_TYPES)
+@pytest.mark.parametrize("integer_dtype", utils.INTEGER_TYPES)
 @pytest.mark.parametrize("reflected", [True, False])
 def test_binops_decimal_comp_mixed_integer(args, integer_dtype, reflected):
     """
diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py
index d1e748cf77b..73b71bee77d 100644
--- a/python/cudf/cudf/tests/test_joining.py
+++ b/python/cudf/cudf/tests/test_joining.py
@@ -12,6 +12,7 @@
     NUMERIC_TYPES,
     assert_eq,
     assert_exceptions_equal,
+    assert_join_results_equal,
 )
 
 _JOIN_TYPES = ("left", "inner", "outer", "right", "leftanti", "leftsemi")
@@ -71,37 +72,6 @@ def pd_odd_joins(left, right, join_type):
         return left[left.index.isin(right.index)][left.columns]
 
 
-def assert_join_results_equal(expect, got, how, **kwargs):
-    if how not in _JOIN_TYPES:
-        raise ValueError(f"Unrecognized join type {how}")
-    if how == "right":
-        got = got[expect.columns]
-
-    if isinstance(expect, (pd.Series, cudf.Series)):
-        return assert_eq(
-            expect.sort_values().reset_index(drop=True),
-            got.sort_values().reset_index(drop=True),
-            **kwargs,
-        )
-    elif isinstance(expect, (pd.DataFrame, cudf.DataFrame)):
-        if not len(
-            expect.columns
-        ):  # can't sort_values() on a df without columns
-            return assert_eq(expect, got, **kwargs)
-
-        assert_eq(
-            expect.sort_values(expect.columns.to_list()).reset_index(
-                drop=True
-            ),
-            got.sort_values(got.columns.to_list()).reset_index(drop=True),
-            **kwargs,
-        )
-    elif isinstance(expect, (pd.Index, cudf.Index)):
-        return assert_eq(expect.sort_values(), got.sort_values(), **kwargs)
-    else:
-        raise ValueError(f"Not a join result: {type(expect).__name__}")
-
-
 @pytest.mark.parametrize("aa,bb,how,method", make_params())
 def test_dataframe_join_how(aa, bb, how, method):
     df = cudf.DataFrame()
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index 668247a027d..32cf2592409 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -22,8 +22,8 @@
     NUMERIC_TYPES,
     assert_eq,
     assert_exceptions_equal,
+    assert_join_results_equal,
 )
-from cudf.tests.test_joining import assert_join_results_equal
 from cudf.utils import dtypes as dtypeutils
 
 data_list = [

From baeabe97375bb5434df44ef61358a65ddaea7bc8 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 6 Apr 2021 21:03:54 -0700
Subject: [PATCH 12/54] bump up the arrow version

---
 conda/environments/cudf_dev_cuda10.1.yml | 4 ++--
 conda/environments/cudf_dev_cuda10.2.yml | 4 ++--
 conda/environments/cudf_dev_cuda11.0.yml | 4 ++--
 conda/recipes/libcudf/meta.yaml          | 2 +-
 cpp/cmake/thirdparty/CUDF_GetArrow.cmake | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/conda/environments/cudf_dev_cuda10.1.yml b/conda/environments/cudf_dev_cuda10.1.yml
index bbf63dd46e4..1fb1637e357 100644
--- a/conda/environments/cudf_dev_cuda10.1.yml
+++ b/conda/environments/cudf_dev_cuda10.1.yml
@@ -18,7 +18,7 @@ dependencies:
   - numba>=0.49.0,!=0.51.0
   - numpy
   - pandas>=1.0,<1.3.0dev0
-  - pyarrow=3.0.0
+  - pyarrow=4.0.0
   - fastavro>=0.22.9
   - notebook>=0.5.0
   - cython>=0.29,<0.30
@@ -46,7 +46,7 @@ dependencies:
   - distributed>=2.22.0,<=2021.3.1
   - streamz
   - dlpack
-  - arrow-cpp=3.0.0
+  - arrow-cpp=4.0.0
   - arrow-cpp-proc * cuda
   - boost-cpp>=1.72.0
   - double-conversion
diff --git a/conda/environments/cudf_dev_cuda10.2.yml b/conda/environments/cudf_dev_cuda10.2.yml
index 6a598ed4d37..a31d18ce254 100644
--- a/conda/environments/cudf_dev_cuda10.2.yml
+++ b/conda/environments/cudf_dev_cuda10.2.yml
@@ -18,7 +18,7 @@ dependencies:
   - numba>=0.49,!=0.51.0
   - numpy
   - pandas>=1.0,<1.3.0dev0
-  - pyarrow=3.0.0
+  - pyarrow=4.0.0
   - fastavro>=0.22.9
   - notebook>=0.5.0
   - cython>=0.29,<0.30
@@ -46,7 +46,7 @@ dependencies:
   - distributed>=2.22.0,<=2021.3.1
   - streamz
   - dlpack
-  - arrow-cpp=3.0.0
+  - arrow-cpp=4.0.0
   - arrow-cpp-proc * cuda
   - boost-cpp>=1.72.0
   - double-conversion
diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml
index 2bd3b70c617..ad24311b952 100644
--- a/conda/environments/cudf_dev_cuda11.0.yml
+++ b/conda/environments/cudf_dev_cuda11.0.yml
@@ -18,7 +18,7 @@ dependencies:
   - numba>=0.49,!=0.51.0
   - numpy
   - pandas>=1.0,<1.3.0dev0
-  - pyarrow=3.0.0
+  - pyarrow=4.0.0
   - fastavro>=0.22.9
   - notebook>=0.5.0
   - cython>=0.29,<0.30
@@ -46,7 +46,7 @@ dependencies:
   - distributed>=2.22.0,<=2021.3.1
   - streamz
   - dlpack
-  - arrow-cpp=3.0.0
+  - arrow-cpp=4.0.0
   - arrow-cpp-proc * cuda
   - boost-cpp>=1.72.0
   - double-conversion
diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 7128c4a2f78..c3344ab7750 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -37,7 +37,7 @@ requirements:
   host:
     - librmm {{ minor_version }}.*
     - cudatoolkit {{ cuda_version }}.*
-    - arrow-cpp 3.0.0
+    - arrow-cpp 4.0.0
     - arrow-cpp-proc * cuda
     - boost-cpp 1.72.0
     - dlpack
diff --git a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake
index 752b84ea78b..1224c8bf728 100644
--- a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake
+++ b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake
@@ -120,6 +120,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC)
 
 endfunction()
 
-set(CUDF_VERSION_Arrow 3.0.0)
+set(CUDF_VERSION_Arrow 4.0.0)
 
 find_and_configure_arrow(${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC})

From 9984297a4632824f0931fae5c69a3b8f14b140a7 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 6 Apr 2021 21:07:45 -0700
Subject: [PATCH 13/54] remove xfail

---
 python/cudf/cudf/tests/test_gpu_arrow_parser.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/cudf/cudf/tests/test_gpu_arrow_parser.py b/python/cudf/cudf/tests/test_gpu_arrow_parser.py
index baf2fa62e38..a088ae9f923 100644
--- a/python/cudf/cudf/tests/test_gpu_arrow_parser.py
+++ b/python/cudf/cudf/tests/test_gpu_arrow_parser.py
@@ -179,7 +179,6 @@ def make_gpu_parse_arrow_cats_batch():
 
 
 def test_gpu_parse_arrow_cats():
-    pytest.xfail(reason="need dictionary mapping in libcudf from_arrow")
     batch = make_gpu_parse_arrow_cats_batch()
 
     stream = pa.BufferOutputStream()

From 707dcc0c987d5c46d93ab4ffa9e5357519f704b9 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Thu, 29 Apr 2021 09:47:48 -0700
Subject: [PATCH 14/54] update 11.2 yml

---
 conda/environments/cudf_dev_cuda11.2.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml
index 24f7f7a1144..5debe8d86c0 100644
--- a/conda/environments/cudf_dev_cuda11.2.yml
+++ b/conda/environments/cudf_dev_cuda11.2.yml
@@ -18,7 +18,7 @@ dependencies:
   - numba>=0.53.1
   - numpy
   - pandas>=1.0,<1.3.0dev0
-  - pyarrow=1.0.1
+  - pyarrow=4.0.0
   - fastavro>=0.22.9
   - notebook>=0.5.0
   - cython>=0.29,<0.30
@@ -46,7 +46,7 @@ dependencies:
   - distributed>=2.22.0,<=2021.4.0
   - streamz
   - dlpack
-  - arrow-cpp=1.0.1
+  - arrow-cpp=4.0.0
   - arrow-cpp-proc * cuda
   - boost-cpp>=1.72.0
   - double-conversion

From 7005ed01d30c34e3c853eeda411aff0c9f6b24c0 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 1 Jun 2021 07:44:15 -0700
Subject: [PATCH 15/54] bump arrow patch version

---
 conda/environments/cudf_dev_cuda11.0.yml | 2 +-
 conda/environments/cudf_dev_cuda11.2.yml | 2 +-
 conda/recipes/libcudf/meta.yaml          | 2 +-
 cpp/cmake/thirdparty/CUDF_GetArrow.cmake | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml
index 2f4564a845e..4ddda14703c 100644
--- a/conda/environments/cudf_dev_cuda11.0.yml
+++ b/conda/environments/cudf_dev_cuda11.0.yml
@@ -17,7 +17,7 @@ dependencies:
   - numba>=0.53.1
   - numpy
   - pandas>=1.0,<1.3.0dev0
-  - pyarrow=4.0.0
+  - pyarrow=4.0.1
   - fastavro>=0.22.9
   - notebook>=0.5.0
   - cython>=0.29,<0.30
diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml
index 59d3f33fa96..04cd8225931 100644
--- a/conda/environments/cudf_dev_cuda11.2.yml
+++ b/conda/environments/cudf_dev_cuda11.2.yml
@@ -17,7 +17,7 @@ dependencies:
   - numba>=0.53.1
   - numpy
   - pandas>=1.0,<1.3.0dev0
-  - pyarrow=4.0.0
+  - pyarrow=4.0.1
   - fastavro>=0.22.9
   - notebook>=0.5.0
   - cython>=0.29,<0.30
diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index d12f0e1b5c3..17d9b05ea4f 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -37,7 +37,7 @@ requirements:
   host:
     - librmm {{ minor_version }}.*
     - cudatoolkit {{ cuda_version }}.*
-    - arrow-cpp 4.0.0
+    - arrow-cpp 4.0.1
     - arrow-cpp-proc * cuda
     - dlpack>=0.5,<0.6.0a0
   run:
diff --git a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake
index 3866c6ab64d..b9b6ab18d1f 100644
--- a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake
+++ b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake
@@ -121,6 +121,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC)
 
 endfunction()
 
-set(CUDF_VERSION_Arrow 4.0.0)
+set(CUDF_VERSION_Arrow 4.0.1)
 
 find_and_configure_arrow(${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC})

From 290bc16588d1e5e8cd963ee7abf5241bf875058a Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 1 Jun 2021 08:37:18 -0700
Subject: [PATCH 16/54] remove stale code

---
 python/cudf/cudf/core/column/column.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index f6283b18150..a58b2eda822 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -63,7 +63,6 @@
 from cudf.utils.utils import mask_dtype
 
 T = TypeVar("T", bound="ColumnBase")
-ParentType = Union["cudf.Series", "cudf.Index"]
 
 
 class ColumnBase(Column, Serializable):

From 6cd394dd6b11fdd61ebc0f742c03298a529d85be Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 1 Jun 2021 08:46:39 -0700
Subject: [PATCH 17/54] temporary change

---
 ci/gpu/build.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index e1ddfa1cc56..ed247cb637e 100755
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -85,6 +85,7 @@ gpuci_conda_retry install -y \
 # https://docs.rapids.ai/maintainers/depmgmt/
 # gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env
 # gpuci_conda_retry install -y "your-pkg=1.0.0"
+gpuci_conda_retry install -y "arrow-cpp=4.0.1 pyarrow=4.0.1"
 
 
 gpuci_logger "Check compiler versions"

From cab9539c4972f79fe82ebbc08e72bb40fb71d6c8 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 1 Jun 2021 09:33:37 -0700
Subject: [PATCH 18/54] remove to_pandas()

---
 python/cudf/cudf/_lib/utils.pyx    | 3 ---
 python/cudf/cudf/core/dataframe.py | 7 +------
 2 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
index 199d6fb32a4..449d01357b0 100644
--- a/python/cudf/cudf/_lib/utils.pyx
+++ b/python/cudf/cudf/_lib/utils.pyx
@@ -139,9 +139,6 @@ cpdef generate_pandas_metadata(Table table, index):
 
     metadata = pa.pandas_compat.construct_metadata(
         columns_to_convert=[
-            col.to_pandas()
-            if isinstance(col, cudf.core.column.CategoricalColumn)
-            else
             col
             for col in table._data.columns
         ],
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index eb42166aaa5..c21a807edb0 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -5666,12 +5666,7 @@ def to_arrow(self, preserve_index=True):
 
         out = super(DataFrame, data).to_arrow()
         metadata = pa.pandas_compat.construct_metadata(
-            columns_to_convert=[
-                col.to_pandas()
-                if isinstance(col, cudf.core.column.CategoricalColumn)
-                else col
-                for col in self._data.columns
-            ],
+            columns_to_convert=[self[col] for col in self._data.names],
             df=self,
             column_names=out.schema.names,
             index_levels=[self.index],

From 173d7947ee9373a58880b33c937917d5c6e19d7e Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 1 Jun 2021 09:36:52 -0700
Subject: [PATCH 19/54] imports

---
 python/dask_cudf/dask_cudf/tests/test_core.py        | 4 ++--
 python/dask_cudf/dask_cudf/tests/test_distributed.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py
index 0110e5e38d5..07920863186 100644
--- a/python/dask_cudf/dask_cudf/tests/test_core.py
+++ b/python/dask_cudf/dask_cudf/tests/test_core.py
@@ -18,10 +18,10 @@
 
 from dask.utils import M
 
-import dask_cudf as dgd
-
 import cudf
 
+import dask_cudf as dgd
+
 
 def test_from_cudf():
     np.random.seed(0)
diff --git a/python/dask_cudf/dask_cudf/tests/test_distributed.py b/python/dask_cudf/dask_cudf/tests/test_distributed.py
index b7dc17f08b6..876a66f78d7 100644
--- a/python/dask_cudf/dask_cudf/tests/test_distributed.py
+++ b/python/dask_cudf/dask_cudf/tests/test_distributed.py
@@ -6,11 +6,11 @@
 from dask.distributed import Client
 from distributed.utils_test import loop  # noqa: F401
 
-import dask_cudf
-
 import cudf
 from cudf.testing._utils import assert_eq
 
+import dask_cudf
+
 dask_cuda = pytest.importorskip("dask_cuda")
 
 

From b9a90bf4753f93247c0d77c7674174543364d2a1 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Tue, 8 Jun 2021 13:10:54 -0500
Subject: [PATCH 20/54] temp commit

---
 ci/gpu/build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index ed247cb637e..66cc0e307ad 100755
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -83,7 +83,7 @@ gpuci_conda_retry install -y \
                   "ucx-py=${MINOR_VERSION}"
 
 # https://docs.rapids.ai/maintainers/depmgmt/
-# gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env
+gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env
 # gpuci_conda_retry install -y "your-pkg=1.0.0"
 gpuci_conda_retry install -y "arrow-cpp=4.0.1 pyarrow=4.0.1"
 

From 0fb2c6d240d3c267850cec2d96b36257c53b510e Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Mon, 14 Jun 2021 14:29:37 -0500
Subject: [PATCH 21/54] style

---
 python/cudf/cudf/tests/test_categorical.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
index ea09670a662..1c384b57257 100644
--- a/python/cudf/cudf/tests/test_categorical.py
+++ b/python/cudf/cudf/tests/test_categorical.py
@@ -9,7 +9,7 @@
 
 import cudf
 from cudf.core._compat import PANDAS_GE_110
-from cudf.testing._utils import NUMERIC_TYPES, assert_eq, assert_exceptions_equal
+from cudf.testing._utils import NUMERIC_TYPES, assert_exceptions_equal, assert_eq
 
 
 @pytest.fixture

From 6046433fe702128dd1cd8488c41ab726391320c4 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Mon, 14 Jun 2021 14:35:36 -0500
Subject: [PATCH 22/54] style

---
 python/cudf/cudf/tests/test_categorical.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
index 1c384b57257..dc9610176c9 100644
--- a/python/cudf/cudf/tests/test_categorical.py
+++ b/python/cudf/cudf/tests/test_categorical.py
@@ -9,7 +9,11 @@
 
 import cudf
 from cudf.core._compat import PANDAS_GE_110
-from cudf.testing._utils import NUMERIC_TYPES, assert_exceptions_equal, assert_eq
+from cudf.testing._utils import (
+    NUMERIC_TYPES,
+    assert_eq,
+    assert_exceptions_equal,
+)
 
 
 @pytest.fixture

From 995b4a529c0341819eb7bbf3d133a1525493bffe Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 15 Jun 2021 14:07:53 -0500
Subject: [PATCH 23/54] temp commit

---
 ci/gpu/build.sh                    | 2 +-
 python/cudf/cudf/tests/test_cut.py | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index 54a8e341c8b..6da273a08de 100755
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -169,7 +169,7 @@ else
     for gt in gtests/* ; do
         test_name=$(basename ${gt})
         echo "Running GoogleTest $test_name"
-        ${gt} --gtest_output=xml:"$WORKSPACE/test-results/"
+        ${gt}
     done
 
     CUDF_CONDA_FILE=`find ${CONDA_ARTIFACT_PATH} -name "libcudf-*.tar.bz2"`
diff --git a/python/cudf/cudf/tests/test_cut.py b/python/cudf/cudf/tests/test_cut.py
index 926826ac188..710df78e36b 100644
--- a/python/cudf/cudf/tests/test_cut.py
+++ b/python/cudf/cudf/tests/test_cut.py
@@ -4,11 +4,12 @@
 Test related to Cut
 """
 
-import pandas as pd
 import numpy as np
-from cudf.core.cut import cut
+import pandas as pd
 import pytest
-from cudf.tests.utils import assert_eq
+
+from cudf.core.cut import cut
+from cudf.testing._utils import assert_eq
 
 
 @pytest.mark.parametrize(

From 7a199ad5d8a8b24c58da1c619eeccb76c74a24ee Mon Sep 17 00:00:00 2001
From: ptaylor <paul.e.taylor@me.com>
Date: Wed, 16 Jun 2021 18:46:56 -0500
Subject: [PATCH 24/54] flush after writing second JSON line

---
 cpp/tests/io/arrow_io_source_test.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpp/tests/io/arrow_io_source_test.cpp b/cpp/tests/io/arrow_io_source_test.cpp
index 24964db5f8c..72ddb87f19b 100644
--- a/cpp/tests/io/arrow_io_source_test.cpp
+++ b/cpp/tests/io/arrow_io_source_test.cpp
@@ -43,7 +43,8 @@ TEST_F(ArrowIOTest, URIFileSystem)
 {
   const std::string file_name = temp_env->get_temp_dir() + "JsonLinesFileTest.json";
   std::ofstream outfile(file_name, std::ofstream::out);
-  outfile << "[11, 1.1]\n[22, 2.2]";
+  outfile << "[11, 1.1]" << std::endl;
+  outfile << "[22, 2.2]" << std::endl;
   outfile.close();
 
   std::string file_uri = "file://" + file_name;

From 79a364e785578cd568a867767b3565620fb9fd51 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Thu, 17 Jun 2021 08:25:45 -0700
Subject: [PATCH 25/54] add back xml output

---
 ci/gpu/build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index 8a1ee122dfc..d63ec64faf2 100755
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -171,7 +171,7 @@ else
     for gt in gtests/* ; do
         test_name=$(basename ${gt})
         echo "Running GoogleTest $test_name"
-        ${gt}
+        ${gt} --gtest_output=xml:"$WORKSPACE/test-results/"
     done
 
     CUDF_CONDA_FILE=`find ${CONDA_ARTIFACT_PATH} -name "libcudf-*.tar.bz2"`

From c9c9a153709d96658eeecc6cc3c5af756d8c98e3 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Thu, 17 Jun 2021 15:39:22 -0700
Subject: [PATCH 26/54] temp commit

---
 cpp/tests/io/arrow_io_source_test.cpp | 35 ++++++++++++++++-----------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/cpp/tests/io/arrow_io_source_test.cpp b/cpp/tests/io/arrow_io_source_test.cpp
index 72ddb87f19b..3fdb3988194 100644
--- a/cpp/tests/io/arrow_io_source_test.cpp
+++ b/cpp/tests/io/arrow_io_source_test.cpp
@@ -46,20 +46,27 @@ TEST_F(ArrowIOTest, URIFileSystem)
   outfile << "[11, 1.1]" << std::endl;
   outfile << "[22, 2.2]" << std::endl;
   outfile.close();
-
-  std::string file_uri = "file://" + file_name;
-  std::unique_ptr<cudf::io::arrow_io_source> datasource =
-    std::make_unique<cudf::io::arrow_io_source>(file_uri);
-
-  // Populate the JSON Reader Options
-  cudf::io::json_reader_options options =
-    cudf::io::json_reader_options::builder(cudf::io::source_info(datasource.get())).lines(true);
-
-  // Read the JSON file from the LocalFileSystem
-  cudf::io::table_with_metadata tbl = cudf::io::read_json(options);
-
-  ASSERT_EQ(2, tbl.tbl->num_columns());
-  ASSERT_EQ(2, tbl.tbl->num_rows());
+  std::string line;
+  std::ifstream myfile(file_name);
+  if (myfile.is_open()) {
+    while (getline(myfile, line)) { std::cout << line << '\n'; }
+    myfile.close();
+  } else
+    std::cout << "Unable to open file";
+
+  // std::string file_uri = "file://" + file_name;
+  // std::unique_ptr<cudf::io::arrow_io_source> datasource =
+  //   std::make_unique<cudf::io::arrow_io_source>(file_uri);
+
+  // // Populate the JSON Reader Options
+  // cudf::io::json_reader_options options =
+  //   cudf::io::json_reader_options::builder(cudf::io::source_info(datasource.get())).lines(true);
+
+  // // Read the JSON file from the LocalFileSystem
+  // cudf::io::table_with_metadata tbl = cudf::io::read_json(options);
+
+  // ASSERT_EQ(2, tbl.tbl->num_columns());
+  // ASSERT_EQ(2, tbl.tbl->num_rows());
 }
 
 #ifdef S3_ENABLED

From 47e3abf2cebf4118679bd3f64b0bb80488fd3c32 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Thu, 17 Jun 2021 20:03:35 -0700
Subject: [PATCH 27/54] disable tests

---
 cpp/tests/io/arrow_io_source_test.cpp | 36 +++++++++++++--------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/cpp/tests/io/arrow_io_source_test.cpp b/cpp/tests/io/arrow_io_source_test.cpp
index 3fdb3988194..b3b26062dce 100644
--- a/cpp/tests/io/arrow_io_source_test.cpp
+++ b/cpp/tests/io/arrow_io_source_test.cpp
@@ -73,30 +73,30 @@ TEST_F(ArrowIOTest, URIFileSystem)
 
 TEST_F(ArrowIOTest, S3FileSystem)
 {
-  std::string s3_uri = "s3://rapidsai-data/cudf/test/tips.parquet?region=us-east-2";
-  std::unique_ptr<cudf::io::arrow_io_source> datasource =
-    std::make_unique<cudf::io::arrow_io_source>(s3_uri);
-
-  // Populate the Parquet Reader Options
-  cudf::io::source_info src(datasource.get());
-  std::vector<std::string> single_column;
-  single_column.insert(single_column.begin(), "total_bill");
-  cudf::io::parquet_reader_options_builder builder(src);
-  cudf::io::parquet_reader_options options = builder.columns(single_column).build();
-
-  // Read the Parquet file from S3
-  cudf::io::table_with_metadata tbl = cudf::io::read_parquet(options);
-
-  ASSERT_EQ(1, tbl.tbl->num_columns());  // Only single column specified in reader_options
-  ASSERT_EQ(244, tbl.tbl->num_rows());   // known number of rows from the S3 file
+  // std::string s3_uri = "s3://rapidsai-data/cudf/test/tips.parquet?region=us-east-2";
+  // std::unique_ptr<cudf::io::arrow_io_source> datasource =
+  //   std::make_unique<cudf::io::arrow_io_source>(s3_uri);
+
+  // // Populate the Parquet Reader Options
+  // cudf::io::source_info src(datasource.get());
+  // std::vector<std::string> single_column;
+  // single_column.insert(single_column.begin(), "total_bill");
+  // cudf::io::parquet_reader_options_builder builder(src);
+  // cudf::io::parquet_reader_options options = builder.columns(single_column).build();
+
+  // // Read the Parquet file from S3
+  // cudf::io::table_with_metadata tbl = cudf::io::read_parquet(options);
+
+  // ASSERT_EQ(1, tbl.tbl->num_columns());  // Only single column specified in reader_options
+  // ASSERT_EQ(244, tbl.tbl->num_rows());   // known number of rows from the S3 file
 }
 
 #else
 
 TEST_F(ArrowIOTest, S3URIWhenNotEnabled)
 {
-  std::string s3_uri = "s3://rapidsai-data/cudf/test/tips.parquet?region=us-east-2";
-  EXPECT_THROW(std::make_unique<cudf::io::arrow_io_source>(s3_uri), cudf::logic_error);
+  // std::string s3_uri = "s3://rapidsai-data/cudf/test/tips.parquet?region=us-east-2";
+  // EXPECT_THROW(std::make_unique<cudf::io::arrow_io_source>(s3_uri), cudf::logic_error);
 }
 
 #endif

From 4775068627b39d7aa274712a466be9c67430e77b Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Fri, 18 Jun 2021 06:38:19 -0700
Subject: [PATCH 28/54] disable arrow test

---
 cpp/tests/io/csv_test.cpp | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp
index f2278267f74..a2835e576c4 100644
--- a/cpp/tests/io/csv_test.cpp
+++ b/cpp/tests/io/csv_test.cpp
@@ -1060,25 +1060,25 @@ TEST_F(CsvReaderTest, HeaderOnlyFile)
 
 TEST_F(CsvReaderTest, ArrowFileSource)
 {
-  auto filepath = temp_env->get_temp_dir() + "ArrowFileSource.csv";
-  {
-    std::ofstream outfile(filepath, std::ofstream::out);
-    outfile << "A\n9\n8\n7\n6\n5\n4\n3\n2\n";
-  }
+  // auto filepath = temp_env->get_temp_dir() + "ArrowFileSource.csv";
+  // {
+  //   std::ofstream outfile(filepath, std::ofstream::out);
+  //   outfile << "A\n9\n8\n7\n6\n5\n4\n3\n2\n";
+  // }
 
-  std::shared_ptr<arrow::io::ReadableFile> infile;
-  ASSERT_TRUE(arrow::io::ReadableFile::Open(filepath).Value(&infile).ok());
+  // std::shared_ptr<arrow::io::ReadableFile> infile;
+  // ASSERT_TRUE(arrow::io::ReadableFile::Open(filepath).Value(&infile).ok());
 
-  auto arrow_source = cudf_io::arrow_io_source{infile};
-  cudf_io::csv_reader_options in_opts =
-    cudf_io::csv_reader_options::builder(cudf_io::source_info{&arrow_source}).dtypes({"int8"});
-  auto result = cudf_io::read_csv(in_opts);
+  // auto arrow_source = cudf_io::arrow_io_source{infile};
+  // cudf_io::csv_reader_options in_opts =
+  //   cudf_io::csv_reader_options::builder(cudf_io::source_info{&arrow_source}).dtypes({"int8"});
+  // auto result = cudf_io::read_csv(in_opts);
 
-  const auto view = result.tbl->view();
-  EXPECT_EQ(1, view.num_columns());
-  ASSERT_EQ(cudf::type_id::INT8, view.column(0).type().id());
+  // const auto view = result.tbl->view();
+  // EXPECT_EQ(1, view.num_columns());
+  // ASSERT_EQ(cudf::type_id::INT8, view.column(0).type().id());
 
-  expect_column_data_equal(std::vector<int8_t>{9, 8, 7, 6, 5, 4, 3, 2}, view.column(0));
+  // expect_column_data_equal(std::vector<int8_t>{9, 8, 7, 6, 5, 4, 3, 2}, view.column(0));
 }
 
 TEST_F(CsvReaderTest, InvalidFloatingPoint)

From 626b9159bda7d18bf69a79cd6f3a30a654ad4056 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 22 Jun 2021 12:41:30 -0700
Subject: [PATCH 29/54] add arrow 4.0.1 in cpu builds

---
 ci/cpu/build.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh
index e11a0488624..c721c7f2a7f 100755
--- a/ci/cpu/build.sh
+++ b/ci/cpu/build.sh
@@ -41,6 +41,9 @@ env
 gpuci_logger "Activate conda env"
 . /opt/conda/etc/profile.d/conda.sh
 conda activate rapids
+gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env
+# gpuci_conda_retry install -y "your-pkg=1.0.0"
+gpuci_conda_retry install -y "arrow-cpp=4.0.1 pyarrow=4.0.1"
 
 # Remove rapidsai-nightly channel if we are building main branch
 if [ "$SOURCE_BRANCH" = "main" ]; then

From a7b631e105f28d79f4b49438bde435b5bf622000 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 22 Jun 2021 13:09:03 -0700
Subject: [PATCH 30/54] conda install

---
 ci/cpu/build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh
index c721c7f2a7f..1a335e0797b 100755
--- a/ci/cpu/build.sh
+++ b/ci/cpu/build.sh
@@ -41,7 +41,7 @@ env
 gpuci_logger "Activate conda env"
 . /opt/conda/etc/profile.d/conda.sh
 conda activate rapids
-gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env
+gpuci_conda_retry remove --force arrow-cpp pyarrow "arrow-cpp-proc * cuda"
 # gpuci_conda_retry install -y "your-pkg=1.0.0"
 gpuci_conda_retry install -y "arrow-cpp=4.0.1 pyarrow=4.0.1"
 

From 951df947aef6971e28d8e07122b8b36e22d110c9 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 22 Jun 2021 13:22:22 -0700
Subject: [PATCH 31/54] gpu packages

---
 ci/cpu/build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh
index 1a335e0797b..98782ef39f7 100755
--- a/ci/cpu/build.sh
+++ b/ci/cpu/build.sh
@@ -43,7 +43,7 @@ gpuci_logger "Activate conda env"
 conda activate rapids
 gpuci_conda_retry remove --force arrow-cpp pyarrow "arrow-cpp-proc * cuda"
 # gpuci_conda_retry install -y "your-pkg=1.0.0"
-gpuci_conda_retry install -y "arrow-cpp=4.0.1 pyarrow=4.0.1"
+gpuci_conda_retry install -y "arrow-cpp=4.0.1 pyarrow=4.0.1 arrow-cpp-proc * cuda"
 
 # Remove rapidsai-nightly channel if we are building main branch
 if [ "$SOURCE_BRANCH" = "main" ]; then

From 0c3570faef553bebfb1dc6f0c3b2b320b2a7c33b Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 22 Jun 2021 14:10:17 -0700
Subject: [PATCH 32/54] -y

---
 ci/cpu/build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh
index 98782ef39f7..d9caf04a92e 100755
--- a/ci/cpu/build.sh
+++ b/ci/cpu/build.sh
@@ -41,7 +41,7 @@ env
 gpuci_logger "Activate conda env"
 . /opt/conda/etc/profile.d/conda.sh
 conda activate rapids
-gpuci_conda_retry remove --force arrow-cpp pyarrow "arrow-cpp-proc * cuda"
+gpuci_conda_retry remove -y --force arrow-cpp pyarrow "arrow-cpp-proc * cuda"
 # gpuci_conda_retry install -y "your-pkg=1.0.0"
 gpuci_conda_retry install -y "arrow-cpp=4.0.1 pyarrow=4.0.1 arrow-cpp-proc * cuda"
 

From 6eecdc713510c104af25b0f81667a8075206bac4 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 22 Jun 2021 14:19:31 -0700
Subject: [PATCH 33/54] use conda

---
 ci/cpu/build.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh
index d9caf04a92e..6c29e2c9381 100755
--- a/ci/cpu/build.sh
+++ b/ci/cpu/build.sh
@@ -41,9 +41,9 @@ env
 gpuci_logger "Activate conda env"
 . /opt/conda/etc/profile.d/conda.sh
 conda activate rapids
-gpuci_conda_retry remove -y --force arrow-cpp pyarrow "arrow-cpp-proc * cuda"
+conda remove -y --force arrow-cpp pyarrow "arrow-cpp-proc * cuda"
 # gpuci_conda_retry install -y "your-pkg=1.0.0"
-gpuci_conda_retry install -y "arrow-cpp=4.0.1 pyarrow=4.0.1 arrow-cpp-proc * cuda"
+conda install -y "arrow-cpp=4.0.1 pyarrow=4.0.1 arrow-cpp-proc * cuda"
 
 # Remove rapidsai-nightly channel if we are building main branch
 if [ "$SOURCE_BRANCH" = "main" ]; then

From 038349f43a0254331686bb5d115e1996edb1cae0 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 22 Jun 2021 14:24:39 -0700
Subject: [PATCH 34/54] fix

---
 ci/cpu/build.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh
index 6c29e2c9381..1dc18078fe5 100755
--- a/ci/cpu/build.sh
+++ b/ci/cpu/build.sh
@@ -41,9 +41,9 @@ env
 gpuci_logger "Activate conda env"
 . /opt/conda/etc/profile.d/conda.sh
 conda activate rapids
-conda remove -y --force arrow-cpp pyarrow "arrow-cpp-proc * cuda"
+gpuci_conda_retry remove -y --force arrow-cpp pyarrow 'arrow-cpp-proc=*=cuda'
 # gpuci_conda_retry install -y "your-pkg=1.0.0"
-conda install -y "arrow-cpp=4.0.1 pyarrow=4.0.1 arrow-cpp-proc * cuda"
+gpuci_conda_retry install -y "arrow-cpp=4.0.1 pyarrow=4.0.1 arrow-cpp-proc * cuda"
 
 # Remove rapidsai-nightly channel if we are building main branch
 if [ "$SOURCE_BRANCH" = "main" ]; then

From 9f06543d1dcf52c24cc961cf6b7cc83ee3769039 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 22 Jun 2021 14:25:37 -0700
Subject: [PATCH 35/54] fix

---
 ci/cpu/build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh
index 1dc18078fe5..c94588b91c7 100755
--- a/ci/cpu/build.sh
+++ b/ci/cpu/build.sh
@@ -43,7 +43,7 @@ gpuci_logger "Activate conda env"
 conda activate rapids
 gpuci_conda_retry remove -y --force arrow-cpp pyarrow 'arrow-cpp-proc=*=cuda'
 # gpuci_conda_retry install -y "your-pkg=1.0.0"
-gpuci_conda_retry install -y "arrow-cpp=4.0.1 pyarrow=4.0.1 arrow-cpp-proc * cuda"
+gpuci_conda_retry install -y "arrow-cpp=4.0.1 pyarrow=4.0.1" 'arrow-cpp-proc=*=cuda'
 
 # Remove rapidsai-nightly channel if we are building main branch
 if [ "$SOURCE_BRANCH" = "main" ]; then

From 18dac174dbfe0c606b72c140b9849ca53b686518 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Tue, 22 Jun 2021 16:28:34 -0500
Subject: [PATCH 36/54] Apply suggestions from code review

Co-authored-by: jakirkham <jakirkham@gmail.com>
---
 ci/cpu/build.sh | 2 +-
 ci/gpu/build.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh
index c94588b91c7..2c6cecf667b 100755
--- a/ci/cpu/build.sh
+++ b/ci/cpu/build.sh
@@ -43,7 +43,7 @@ gpuci_logger "Activate conda env"
 conda activate rapids
 gpuci_conda_retry remove -y --force arrow-cpp pyarrow 'arrow-cpp-proc=*=cuda'
 # gpuci_conda_retry install -y "your-pkg=1.0.0"
-gpuci_conda_retry install -y "arrow-cpp=4.0.1 pyarrow=4.0.1" 'arrow-cpp-proc=*=cuda'
+gpuci_conda_retry install -y "arrow-cpp=4.0.1" "pyarrow=4.0.1" 'arrow-cpp-proc=*=cuda'
 
 # Remove rapidsai-nightly channel if we are building main branch
 if [ "$SOURCE_BRANCH" = "main" ]; then
diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index 68e239544f3..22cf6e67827 100755
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -85,7 +85,7 @@ gpuci_conda_retry install -y \
 # https://docs.rapids.ai/maintainers/depmgmt/
 gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env
 # gpuci_conda_retry install -y "your-pkg=1.0.0"
-gpuci_conda_retry install -y "arrow-cpp=4.0.1 pyarrow=4.0.1"
+gpuci_conda_retry install -y "arrow-cpp=4.0.1" "pyarrow=4.0.1" 'arrow-cpp-proc=*=cuda'
 
 
 gpuci_logger "Check compiler versions"

From 2cf75fbb0648e711c6e977bb7c5703fa10261d86 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 22 Jun 2021 16:32:38 -0700
Subject: [PATCH 37/54] revert disabling of arrow tests

---
 cpp/tests/io/arrow_io_source_test.cpp | 74 ++++++++++++---------------
 cpp/tests/io/csv_test.cpp             | 30 +++++------
 2 files changed, 48 insertions(+), 56 deletions(-)

diff --git a/cpp/tests/io/arrow_io_source_test.cpp b/cpp/tests/io/arrow_io_source_test.cpp
index b3b26062dce..24964db5f8c 100644
--- a/cpp/tests/io/arrow_io_source_test.cpp
+++ b/cpp/tests/io/arrow_io_source_test.cpp
@@ -43,60 +43,52 @@ TEST_F(ArrowIOTest, URIFileSystem)
 {
   const std::string file_name = temp_env->get_temp_dir() + "JsonLinesFileTest.json";
   std::ofstream outfile(file_name, std::ofstream::out);
-  outfile << "[11, 1.1]" << std::endl;
-  outfile << "[22, 2.2]" << std::endl;
+  outfile << "[11, 1.1]\n[22, 2.2]";
   outfile.close();
-  std::string line;
-  std::ifstream myfile(file_name);
-  if (myfile.is_open()) {
-    while (getline(myfile, line)) { std::cout << line << '\n'; }
-    myfile.close();
-  } else
-    std::cout << "Unable to open file";
-
-  // std::string file_uri = "file://" + file_name;
-  // std::unique_ptr<cudf::io::arrow_io_source> datasource =
-  //   std::make_unique<cudf::io::arrow_io_source>(file_uri);
-
-  // // Populate the JSON Reader Options
-  // cudf::io::json_reader_options options =
-  //   cudf::io::json_reader_options::builder(cudf::io::source_info(datasource.get())).lines(true);
-
-  // // Read the JSON file from the LocalFileSystem
-  // cudf::io::table_with_metadata tbl = cudf::io::read_json(options);
-
-  // ASSERT_EQ(2, tbl.tbl->num_columns());
-  // ASSERT_EQ(2, tbl.tbl->num_rows());
+
+  std::string file_uri = "file://" + file_name;
+  std::unique_ptr<cudf::io::arrow_io_source> datasource =
+    std::make_unique<cudf::io::arrow_io_source>(file_uri);
+
+  // Populate the JSON Reader Options
+  cudf::io::json_reader_options options =
+    cudf::io::json_reader_options::builder(cudf::io::source_info(datasource.get())).lines(true);
+
+  // Read the JSON file from the LocalFileSystem
+  cudf::io::table_with_metadata tbl = cudf::io::read_json(options);
+
+  ASSERT_EQ(2, tbl.tbl->num_columns());
+  ASSERT_EQ(2, tbl.tbl->num_rows());
 }
 
 #ifdef S3_ENABLED
 
 TEST_F(ArrowIOTest, S3FileSystem)
 {
-  // std::string s3_uri = "s3://rapidsai-data/cudf/test/tips.parquet?region=us-east-2";
-  // std::unique_ptr<cudf::io::arrow_io_source> datasource =
-  //   std::make_unique<cudf::io::arrow_io_source>(s3_uri);
-
-  // // Populate the Parquet Reader Options
-  // cudf::io::source_info src(datasource.get());
-  // std::vector<std::string> single_column;
-  // single_column.insert(single_column.begin(), "total_bill");
-  // cudf::io::parquet_reader_options_builder builder(src);
-  // cudf::io::parquet_reader_options options = builder.columns(single_column).build();
-
-  // // Read the Parquet file from S3
-  // cudf::io::table_with_metadata tbl = cudf::io::read_parquet(options);
-
-  // ASSERT_EQ(1, tbl.tbl->num_columns());  // Only single column specified in reader_options
-  // ASSERT_EQ(244, tbl.tbl->num_rows());   // known number of rows from the S3 file
+  std::string s3_uri = "s3://rapidsai-data/cudf/test/tips.parquet?region=us-east-2";
+  std::unique_ptr<cudf::io::arrow_io_source> datasource =
+    std::make_unique<cudf::io::arrow_io_source>(s3_uri);
+
+  // Populate the Parquet Reader Options
+  cudf::io::source_info src(datasource.get());
+  std::vector<std::string> single_column;
+  single_column.insert(single_column.begin(), "total_bill");
+  cudf::io::parquet_reader_options_builder builder(src);
+  cudf::io::parquet_reader_options options = builder.columns(single_column).build();
+
+  // Read the Parquet file from S3
+  cudf::io::table_with_metadata tbl = cudf::io::read_parquet(options);
+
+  ASSERT_EQ(1, tbl.tbl->num_columns());  // Only single column specified in reader_options
+  ASSERT_EQ(244, tbl.tbl->num_rows());   // known number of rows from the S3 file
 }
 
 #else
 
 TEST_F(ArrowIOTest, S3URIWhenNotEnabled)
 {
-  // std::string s3_uri = "s3://rapidsai-data/cudf/test/tips.parquet?region=us-east-2";
-  // EXPECT_THROW(std::make_unique<cudf::io::arrow_io_source>(s3_uri), cudf::logic_error);
+  std::string s3_uri = "s3://rapidsai-data/cudf/test/tips.parquet?region=us-east-2";
+  EXPECT_THROW(std::make_unique<cudf::io::arrow_io_source>(s3_uri), cudf::logic_error);
 }
 
 #endif
diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp
index a2835e576c4..f2278267f74 100644
--- a/cpp/tests/io/csv_test.cpp
+++ b/cpp/tests/io/csv_test.cpp
@@ -1060,25 +1060,25 @@ TEST_F(CsvReaderTest, HeaderOnlyFile)
 
 TEST_F(CsvReaderTest, ArrowFileSource)
 {
-  // auto filepath = temp_env->get_temp_dir() + "ArrowFileSource.csv";
-  // {
-  //   std::ofstream outfile(filepath, std::ofstream::out);
-  //   outfile << "A\n9\n8\n7\n6\n5\n4\n3\n2\n";
-  // }
+  auto filepath = temp_env->get_temp_dir() + "ArrowFileSource.csv";
+  {
+    std::ofstream outfile(filepath, std::ofstream::out);
+    outfile << "A\n9\n8\n7\n6\n5\n4\n3\n2\n";
+  }
 
-  // std::shared_ptr<arrow::io::ReadableFile> infile;
-  // ASSERT_TRUE(arrow::io::ReadableFile::Open(filepath).Value(&infile).ok());
+  std::shared_ptr<arrow::io::ReadableFile> infile;
+  ASSERT_TRUE(arrow::io::ReadableFile::Open(filepath).Value(&infile).ok());
 
-  // auto arrow_source = cudf_io::arrow_io_source{infile};
-  // cudf_io::csv_reader_options in_opts =
-  //   cudf_io::csv_reader_options::builder(cudf_io::source_info{&arrow_source}).dtypes({"int8"});
-  // auto result = cudf_io::read_csv(in_opts);
+  auto arrow_source = cudf_io::arrow_io_source{infile};
+  cudf_io::csv_reader_options in_opts =
+    cudf_io::csv_reader_options::builder(cudf_io::source_info{&arrow_source}).dtypes({"int8"});
+  auto result = cudf_io::read_csv(in_opts);
 
-  // const auto view = result.tbl->view();
-  // EXPECT_EQ(1, view.num_columns());
-  // ASSERT_EQ(cudf::type_id::INT8, view.column(0).type().id());
+  const auto view = result.tbl->view();
+  EXPECT_EQ(1, view.num_columns());
+  ASSERT_EQ(cudf::type_id::INT8, view.column(0).type().id());
 
-  // expect_column_data_equal(std::vector<int8_t>{9, 8, 7, 6, 5, 4, 3, 2}, view.column(0));
+  expect_column_data_equal(std::vector<int8_t>{9, 8, 7, 6, 5, 4, 3, 2}, view.column(0));
 }
 
 TEST_F(CsvReaderTest, InvalidFloatingPoint)

From d07ccfa9f4579e9eefc306e89e7fe8b56866ea2c Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 23 Jun 2021 08:14:43 -0700
Subject: [PATCH 38/54] test

---
 conda/recipes/cudf_kafka/meta.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml
index 3506d118f07..635fb0f00af 100644
--- a/conda/recipes/cudf_kafka/meta.yaml
+++ b/conda/recipes/cudf_kafka/meta.yaml
@@ -29,6 +29,9 @@ requirements:
     - python
     - cython >=0.29,<0.30
     - setuptools
+    - pyarrow=4.0.1
+    - arrow-cpp=4.0.1
+    - arrow-cpp-proc * cuda
     - cudf {{ version }}
     - libcudf_kafka {{ version }}
   run:

From bec13ac2245b04de761c59ab769f36272568a17c Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 23 Jun 2021 14:55:52 -0700
Subject: [PATCH 39/54] test

---
 conda/recipes/cudf_kafka/meta.yaml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml
index 635fb0f00af..3506d118f07 100644
--- a/conda/recipes/cudf_kafka/meta.yaml
+++ b/conda/recipes/cudf_kafka/meta.yaml
@@ -29,9 +29,6 @@ requirements:
     - python
     - cython >=0.29,<0.30
     - setuptools
-    - pyarrow=4.0.1
-    - arrow-cpp=4.0.1
-    - arrow-cpp-proc * cuda
     - cudf {{ version }}
     - libcudf_kafka {{ version }}
   run:

From 726da201312fdd66e0a28e286ece1e9c09f19edf Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 23 Jun 2021 16:45:06 -0700
Subject: [PATCH 40/54] remove force uninstall of arrow

---
 ci/cpu/build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh
index 2c6cecf667b..91f7766cffb 100755
--- a/ci/cpu/build.sh
+++ b/ci/cpu/build.sh
@@ -41,7 +41,7 @@ env
 gpuci_logger "Activate conda env"
 . /opt/conda/etc/profile.d/conda.sh
 conda activate rapids
-gpuci_conda_retry remove -y --force arrow-cpp pyarrow 'arrow-cpp-proc=*=cuda'
+# gpuci_conda_retry remove -y --force arrow-cpp pyarrow 'arrow-cpp-proc=*=cuda'
 # gpuci_conda_retry install -y "your-pkg=1.0.0"
 gpuci_conda_retry install -y "arrow-cpp=4.0.1" "pyarrow=4.0.1" 'arrow-cpp-proc=*=cuda'
 

From 944d32278589164cfc1a873a3ae29146e4664b80 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 23 Jun 2021 18:58:35 -0700
Subject: [PATCH 41/54] change to conda

---
 ci/cpu/build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh
index 91f7766cffb..72de545147a 100755
--- a/ci/cpu/build.sh
+++ b/ci/cpu/build.sh
@@ -43,7 +43,7 @@ gpuci_logger "Activate conda env"
 conda activate rapids
 # gpuci_conda_retry remove -y --force arrow-cpp pyarrow 'arrow-cpp-proc=*=cuda'
 # gpuci_conda_retry install -y "your-pkg=1.0.0"
-gpuci_conda_retry install -y "arrow-cpp=4.0.1" "pyarrow=4.0.1" 'arrow-cpp-proc=*=cuda'
+conda install -y "arrow-cpp=4.0.1" "pyarrow=4.0.1" 'arrow-cpp-proc=*=cuda'
 
 # Remove rapidsai-nightly channel if we are building main branch
 if [ "$SOURCE_BRANCH" = "main" ]; then

From d37af801f44fb35aef1c9d921ddc872a9feabcc6 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 23 Jun 2021 19:05:39 -0700
Subject: [PATCH 42/54] version

---
 conda/recipes/libcudf_kafka/meta.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml
index f1ec813a17f..9d4ca561a45 100644
--- a/conda/recipes/libcudf_kafka/meta.yaml
+++ b/conda/recipes/libcudf_kafka/meta.yaml
@@ -2,6 +2,7 @@
 
 {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %}
 {% set minor_version =  version.split('.')[0] + '.' + version.split('.')[1] %}
+{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %}
 
 package:
   name: libcudf_kafka
@@ -25,7 +26,7 @@ requirements:
   build:
     - cmake >=3.20.1
   host:
-    - libcudf {{ version }}
+    - libcudf-{{version}}-cuda{{ cuda_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
     - librdkafka >=1.5.0,<1.5.3
   run:
     - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not

From 0cb9ed4bf3376647c5b4cd027ec992faf83f96b9 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 23 Jun 2021 19:22:14 -0700
Subject: [PATCH 43/54] add build number

---
 conda/recipes/libcudf_kafka/meta.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml
index 9d4ca561a45..b813aacec85 100644
--- a/conda/recipes/libcudf_kafka/meta.yaml
+++ b/conda/recipes/libcudf_kafka/meta.yaml
@@ -26,7 +26,7 @@ requirements:
   build:
     - cmake >=3.20.1
   host:
-    - libcudf-{{version}}-cuda{{ cuda_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
+    - libcudf {{version}} {{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
     - librdkafka >=1.5.0,<1.5.3
   run:
     - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not

From 9faa4f402fa24b8abc4efc3f96001d6846cbafa7 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 23 Jun 2021 19:59:31 -0700
Subject: [PATCH 44/54] conda

---
 conda/recipes/libcudf_kafka/meta.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml
index b813aacec85..065d388902d 100644
--- a/conda/recipes/libcudf_kafka/meta.yaml
+++ b/conda/recipes/libcudf_kafka/meta.yaml
@@ -26,7 +26,7 @@ requirements:
   build:
     - cmake >=3.20.1
   host:
-    - libcudf {{version}} {{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
+    - libcudf {{version}}-cuda{{ cuda_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
     - librdkafka >=1.5.0,<1.5.3
   run:
     - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not

From 51ef74f5a481334e2e91f1ac5878c3c898ff2eec Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 23 Jun 2021 20:37:19 -0700
Subject: [PATCH 45/54] test

---
 conda/recipes/libcudf_kafka/meta.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml
index 065d388902d..dae7b781cdc 100644
--- a/conda/recipes/libcudf_kafka/meta.yaml
+++ b/conda/recipes/libcudf_kafka/meta.yaml
@@ -26,7 +26,7 @@ requirements:
   build:
     - cmake >=3.20.1
   host:
-    - libcudf {{version}}-cuda{{ cuda_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
+    - libcudf {{version}} cuda{{ cuda_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
     - librdkafka >=1.5.0,<1.5.3
   run:
     - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not

From 39f3f2e6dea7d5678db75ff285a77297aa8d0261 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 23 Jun 2021 21:17:01 -0700
Subject: [PATCH 46/54] unpin librdkafka

---
 conda/recipes/libcudf_kafka/meta.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml
index dae7b781cdc..7cf72958957 100644
--- a/conda/recipes/libcudf_kafka/meta.yaml
+++ b/conda/recipes/libcudf_kafka/meta.yaml
@@ -27,7 +27,7 @@ requirements:
     - cmake >=3.20.1
   host:
     - libcudf {{version}} cuda{{ cuda_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
-    - librdkafka >=1.5.0,<1.5.3
+    - librdkafka
   run:
     - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not
 

From b8d33c8c46839736fca354527a4d2333e85ebd24 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Thu, 24 Jun 2021 09:17:35 -0700
Subject: [PATCH 47/54] bump librdkafka

---
 ci/gpu/build.sh                                     | 4 ++--
 conda/recipes/libcudf_kafka/meta.yaml               | 2 +-
 python/cudf/cudf/tests/test_cuda_array_interface.py | 3 +++
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index 22cf6e67827..333c241f615 100755
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -85,7 +85,7 @@ gpuci_conda_retry install -y \
 # https://docs.rapids.ai/maintainers/depmgmt/
 gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env
 # gpuci_conda_retry install -y "your-pkg=1.0.0"
-gpuci_conda_retry install -y "arrow-cpp=4.0.1" "pyarrow=4.0.1" 'arrow-cpp-proc=*=cuda'
+gpuci_conda_retry install -y "arrow-cpp=4.0.1" "pyarrow=4.0.1" 'arrow-cpp-proc=*=cuda' "pyorc"
 
 
 gpuci_logger "Check compiler versions"
@@ -218,7 +218,7 @@ fi
 
 cd "$WORKSPACE/python/cudf"
 gpuci_logger "Python py.test for cuDF"
-py.test -n 6 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term
+py.test -n 6 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term
 
 cd "$WORKSPACE/python/dask_cudf"
 gpuci_logger "Python py.test for dask-cudf"
diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml
index 7cf72958957..a8ab6811f5a 100644
--- a/conda/recipes/libcudf_kafka/meta.yaml
+++ b/conda/recipes/libcudf_kafka/meta.yaml
@@ -27,7 +27,7 @@ requirements:
     - cmake >=3.20.1
   host:
     - libcudf {{version}} cuda{{ cuda_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
-    - librdkafka
+    - librdkafka >= 1.6.1,<1.7.0a0
   run:
     - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not
 
diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py
index 041dc0076f8..ecf961f133b 100644
--- a/python/cudf/cudf/tests/test_cuda_array_interface.py
+++ b/python/cudf/cudf/tests/test_cuda_array_interface.py
@@ -171,6 +171,9 @@ def test_column_from_ephemeral_cupy_try_lose_reference():
 
 def test_cuda_array_interface_pytorch():
     torch = pytest.importorskip("torch")
+    if not torch.cuda.is_available():
+        pytest.skip("need gpu version of pytorch to be installed")
+
     series = cudf.Series([1, -1, 10, -56])
     tensor = torch.tensor(series)
     got = cudf.Series(tensor)

From 2d188eee56971bcf72513e65f5acb9af08b2db3d Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Thu, 24 Jun 2021 10:37:11 -0700
Subject: [PATCH 48/54] refactor

---
 python/cudf/cudf/testing/_utils.py     |  33 ----
 python/cudf/cudf/tests/test_binops.py  |  16 +-
 python/cudf/cudf/tests/test_joining.py | 222 ++++++++++++++++++++++++-
 python/cudf/cudf/tests/test_string.py  | 191 ---------------------
 4 files changed, 226 insertions(+), 236 deletions(-)

diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py
index 8d2679dd7aa..672e83e6f64 100644
--- a/python/cudf/cudf/testing/_utils.py
+++ b/python/cudf/cudf/testing/_utils.py
@@ -16,8 +16,6 @@
 from cudf.core.column.datetime import _numpy_to_pandas_conversion
 from cudf.utils import dtypes as dtypeutils
 
-_JOIN_TYPES = ("left", "inner", "outer", "right", "leftanti", "leftsemi")
-
 supported_numpy_dtypes = [
     "bool",
     "int8",
@@ -310,36 +308,5 @@ def does_not_raise():
     yield
 
 
-def assert_join_results_equal(expect, got, how, **kwargs):
-    if how not in _JOIN_TYPES:
-        raise ValueError(f"Unrecognized join type {how}")
-    if how == "right":
-        got = got[expect.columns]
-
-    if isinstance(expect, (pd.Series, cudf.Series)):
-        return assert_eq(
-            expect.sort_values().reset_index(drop=True),
-            got.sort_values().reset_index(drop=True),
-            **kwargs,
-        )
-    elif isinstance(expect, (pd.DataFrame, cudf.DataFrame)):
-        if not len(
-            expect.columns
-        ):  # can't sort_values() on a df without columns
-            return assert_eq(expect, got, **kwargs)
-
-        assert_eq(
-            expect.sort_values(expect.columns.to_list()).reset_index(
-                drop=True
-            ),
-            got.sort_values(got.columns.to_list()).reset_index(drop=True),
-            **kwargs,
-        )
-    elif isinstance(expect, (pd.Index, cudf.Index)):
-        return assert_eq(expect.sort_values(), got.sort_values(), **kwargs)
-    else:
-        raise ValueError(f"Not a join result: {type(expect).__name__}")
-
-
 def xfail_param(param, **kwargs):
     return pytest.param(param, marks=pytest.mark.xfail(**kwargs))
diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py
index cfd2ea5143f..1c97cbb10ff 100644
--- a/python/cudf/cudf/tests/test_binops.py
+++ b/python/cudf/cudf/tests/test_binops.py
@@ -1742,12 +1742,6 @@ def test_binops_with_NA_consistent(dtype, op):
         assert result._column.null_count == len(data)
 
 
-def _decimal_series(input, dtype):
-    return cudf.Series(
-        [x if x is None else decimal.Decimal(x) for x in input], dtype=dtype,
-    )
-
-
 @pytest.mark.parametrize(
     "args",
     [
@@ -2080,10 +2074,10 @@ def _decimal_series(input, dtype):
 def test_binops_decimal(args):
     op, lhs, l_dtype, rhs, r_dtype, expect, expect_dtype = args
 
-    a = _decimal_series(lhs, l_dtype)
-    b = _decimal_series(rhs, r_dtype)
+    a = utils._decimal_series(lhs, l_dtype)
+    b = utils._decimal_series(rhs, r_dtype)
     expect = (
-        _decimal_series(expect, expect_dtype)
+        utils._decimal_series(expect, expect_dtype)
         if isinstance(expect_dtype, cudf.Decimal64Dtype)
         else cudf.Series(expect, dtype=expect_dtype)
     )
@@ -2258,7 +2252,7 @@ def test_binops_decimal_comp_mixed_integer(args, integer_dtype, reflected):
     else:
         op, ldata, ldtype, rdata, _, expected = args
 
-    lhs = _decimal_series(ldata, ldtype)
+    lhs = utils._decimal_series(ldata, ldtype)
     rhs = cudf.Series(rdata, dtype=integer_dtype)
 
     if reflected:
@@ -2746,7 +2740,7 @@ def test_binops_decimal_scalar_compare(args, reflected):
     else:
         op, ldata, ldtype, rdata, _, expected = args
 
-    lhs = _decimal_series(ldata, ldtype)
+    lhs = utils._decimal_series(ldata, ldtype)
     rhs = rdata
 
     if reflected:
diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py
index 73b71bee77d..7b56f864272 100644
--- a/python/cudf/cudf/tests/test_joining.py
+++ b/python/cudf/cudf/tests/test_joining.py
@@ -12,7 +12,6 @@
     NUMERIC_TYPES,
     assert_eq,
     assert_exceptions_equal,
-    assert_join_results_equal,
 )
 
 _JOIN_TYPES = ("left", "inner", "outer", "right", "leftanti", "leftsemi")
@@ -72,6 +71,37 @@ def pd_odd_joins(left, right, join_type):
         return left[left.index.isin(right.index)][left.columns]
 
 
+def assert_join_results_equal(expect, got, how, **kwargs):
+    if how not in _JOIN_TYPES:
+        raise ValueError(f"Unrecognized join type {how}")
+    if how == "right":
+        got = got[expect.columns]
+
+    if isinstance(expect, (pd.Series, cudf.Series)):
+        return assert_eq(
+            expect.sort_values().reset_index(drop=True),
+            got.sort_values().reset_index(drop=True),
+            **kwargs,
+        )
+    elif isinstance(expect, (pd.DataFrame, cudf.DataFrame)):
+        if not len(
+            expect.columns
+        ):  # can't sort_values() on a df without columns
+            return assert_eq(expect, got, **kwargs)
+
+        assert_eq(
+            expect.sort_values(expect.columns.to_list()).reset_index(
+                drop=True
+            ),
+            got.sort_values(got.columns.to_list()).reset_index(drop=True),
+            **kwargs,
+        )
+    elif isinstance(expect, (pd.Index, cudf.Index)):
+        return assert_eq(expect.sort_values(), got.sort_values(), **kwargs)
+    else:
+        raise ValueError(f"Not a join result: {type(expect).__name__}")
+
+
 @pytest.mark.parametrize("aa,bb,how,method", make_params())
 def test_dataframe_join_how(aa, bb, how, method):
     df = cudf.DataFrame()
@@ -1892,3 +1922,193 @@ def test_join_merge_invalid_keys(on, how):
     with pytest.raises(KeyError):
         pd_left.merge(pd_right, on=on)
         gd_left.merge(gd_right, on=on)
+
+
+@pytest.mark.parametrize(
+    "str_data",
+    [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]],
+)
+@pytest.mark.parametrize("num_keys", [1, 2, 3])
+@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"])
+def test_string_join_key(str_data, num_keys, how):
+    other_data = [1, 2, 3, 4, 5][: len(str_data)]
+
+    pdf = pd.DataFrame()
+    gdf = cudf.DataFrame()
+    for i in range(num_keys):
+        pdf[i] = pd.Series(str_data, dtype="str")
+        gdf[i] = cudf.Series(str_data, dtype="str")
+    pdf["a"] = other_data
+    gdf["a"] = other_data
+
+    pdf2 = pdf.copy()
+    gdf2 = gdf.copy()
+
+    expect = pdf.merge(pdf2, on=list(range(num_keys)), how=how)
+    got = gdf.merge(gdf2, on=list(range(num_keys)), how=how)
+
+    if len(expect) == 0 and len(got) == 0:
+        expect = expect.reset_index(drop=True)
+        got = got[expect.columns]  # reorder columns
+
+    if how == "right":
+        got = got[expect.columns]  # reorder columns
+
+    assert_join_results_equal(expect, got, how=how)
+
+
+@pytest.mark.parametrize(
+    "str_data_nulls",
+    [
+        ["a", "b", "c"],
+        ["a", "b", "f", "g"],
+        ["f", "g", "h", "i", "j"],
+        ["f", "g", "h"],
+        [None, None, None, None, None],
+        [],
+    ],
+)
+def test_string_join_key_nulls(str_data_nulls):
+    str_data = ["a", "b", "c", "d", "e"]
+    other_data = [1, 2, 3, 4, 5]
+
+    other_data_nulls = [6, 7, 8, 9, 10][: len(str_data_nulls)]
+
+    pdf = pd.DataFrame()
+    gdf = cudf.DataFrame()
+    pdf["key"] = pd.Series(str_data, dtype="str")
+    gdf["key"] = cudf.Series(str_data, dtype="str")
+    pdf["vals"] = other_data
+    gdf["vals"] = other_data
+
+    pdf2 = pd.DataFrame()
+    gdf2 = cudf.DataFrame()
+    pdf2["key"] = pd.Series(str_data_nulls, dtype="str")
+    gdf2["key"] = cudf.Series(str_data_nulls, dtype="str")
+    pdf2["vals"] = pd.Series(other_data_nulls, dtype="int64")
+    gdf2["vals"] = cudf.Series(other_data_nulls, dtype="int64")
+
+    expect = pdf.merge(pdf2, on="key", how="left")
+    got = gdf.merge(gdf2, on="key", how="left")
+    got["vals_y"] = got["vals_y"].fillna(-1)
+
+    if len(expect) == 0 and len(got) == 0:
+        expect = expect.reset_index(drop=True)
+        got = got[expect.columns]
+
+    expect["vals_y"] = expect["vals_y"].fillna(-1).astype("int64")
+
+    assert_join_results_equal(expect, got, how="left")
+
+
+@pytest.mark.parametrize(
+    "str_data", [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]]
+)
+@pytest.mark.parametrize("num_cols", [1, 2, 3])
+@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"])
+def test_string_join_non_key(str_data, num_cols, how):
+    other_data = [1, 2, 3, 4, 5][: len(str_data)]
+
+    pdf = pd.DataFrame()
+    gdf = cudf.DataFrame()
+    for i in range(num_cols):
+        pdf[i] = pd.Series(str_data, dtype="str")
+        gdf[i] = cudf.Series(str_data, dtype="str")
+    pdf["a"] = other_data
+    gdf["a"] = other_data
+
+    pdf2 = pdf.copy()
+    gdf2 = gdf.copy()
+
+    expect = pdf.merge(pdf2, on=["a"], how=how)
+    got = gdf.merge(gdf2, on=["a"], how=how)
+
+    if len(expect) == 0 and len(got) == 0:
+        expect = expect.reset_index(drop=True)
+        got = got[expect.columns]
+
+    if how == "right":
+        got = got[expect.columns]  # reorder columns
+
+    assert_join_results_equal(expect, got, how=how)
+
+
+@pytest.mark.parametrize(
+    "str_data_nulls",
+    [
+        ["a", "b", "c"],
+        ["a", "b", "f", "g"],
+        ["f", "g", "h", "i", "j"],
+        ["f", "g", "h"],
+        [None, None, None, None, None],
+        [],
+    ],
+)
+def test_string_join_non_key_nulls(str_data_nulls):
+    str_data = ["a", "b", "c", "d", "e"]
+    other_data = [1, 2, 3, 4, 5]
+
+    other_data_nulls = [6, 7, 8, 9, 10][: len(str_data_nulls)]
+
+    pdf = pd.DataFrame()
+    gdf = cudf.DataFrame()
+    pdf["vals"] = pd.Series(str_data, dtype="str")
+    gdf["vals"] = cudf.Series(str_data, dtype="str")
+    pdf["key"] = other_data
+    gdf["key"] = other_data
+
+    pdf2 = pd.DataFrame()
+    gdf2 = cudf.DataFrame()
+    pdf2["vals"] = pd.Series(str_data_nulls, dtype="str")
+    gdf2["vals"] = cudf.Series(str_data_nulls, dtype="str")
+    pdf2["key"] = pd.Series(other_data_nulls, dtype="int64")
+    gdf2["key"] = cudf.Series(other_data_nulls, dtype="int64")
+
+    expect = pdf.merge(pdf2, on="key", how="left")
+    got = gdf.merge(gdf2, on="key", how="left")
+
+    if len(expect) == 0 and len(got) == 0:
+        expect = expect.reset_index(drop=True)
+        got = got[expect.columns]
+
+    assert_join_results_equal(expect, got, how="left")
+
+
+def test_string_join_values_nulls():
+    left_dict = [
+        {"b": "MATCH 1", "a": 1.0},
+        {"b": "MATCH 1", "a": 1.0},
+        {"b": "LEFT NO MATCH 1", "a": -1.0},
+        {"b": "MATCH 2", "a": 2.0},
+        {"b": "MATCH 2", "a": 2.0},
+        {"b": "MATCH 1", "a": 1.0},
+        {"b": "MATCH 1", "a": 1.0},
+        {"b": "MATCH 2", "a": 2.0},
+        {"b": "MATCH 2", "a": 2.0},
+        {"b": "LEFT NO MATCH 2", "a": -2.0},
+        {"b": "MATCH 3", "a": 3.0},
+        {"b": "MATCH 3", "a": 3.0},
+    ]
+
+    right_dict = [
+        {"b": "RIGHT NO MATCH 1", "c": -1.0},
+        {"b": "MATCH 3", "c": 3.0},
+        {"b": "MATCH 2", "c": 2.0},
+        {"b": "RIGHT NO MATCH 2", "c": -2.0},
+        {"b": "RIGHT NO MATCH 3", "c": -3.0},
+        {"b": "MATCH 1", "c": 1.0},
+    ]
+
+    left_pdf = pd.DataFrame(left_dict)
+    right_pdf = pd.DataFrame(right_dict)
+
+    left_gdf = cudf.DataFrame.from_pandas(left_pdf)
+    right_gdf = cudf.DataFrame.from_pandas(right_pdf)
+
+    expect = left_pdf.merge(right_pdf, how="left", on="b")
+    got = left_gdf.merge(right_gdf, how="left", on="b")
+
+    expect = expect.sort_values(by=["a", "b", "c"]).reset_index(drop=True)
+    got = got.sort_values(by=["a", "b", "c"]).reset_index(drop=True)
+
+    assert_join_results_equal(expect, got, how="left")
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index 8567c479e1c..3c153a16a13 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -23,7 +23,6 @@
     NUMERIC_TYPES,
     assert_eq,
     assert_exceptions_equal,
-    assert_join_results_equal,
 )
 from cudf.utils import dtypes as dtypeutils
 
@@ -919,196 +918,6 @@ def test_string_split(data, pat, n, expand):
     assert_eq(expect, got)
 
 
-@pytest.mark.parametrize(
-    "str_data",
-    [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]],
-)
-@pytest.mark.parametrize("num_keys", [1, 2, 3])
-@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"])
-def test_string_join_key(str_data, num_keys, how):
-    other_data = [1, 2, 3, 4, 5][: len(str_data)]
-
-    pdf = pd.DataFrame()
-    gdf = cudf.DataFrame()
-    for i in range(num_keys):
-        pdf[i] = pd.Series(str_data, dtype="str")
-        gdf[i] = cudf.Series(str_data, dtype="str")
-    pdf["a"] = other_data
-    gdf["a"] = other_data
-
-    pdf2 = pdf.copy()
-    gdf2 = gdf.copy()
-
-    expect = pdf.merge(pdf2, on=list(range(num_keys)), how=how)
-    got = gdf.merge(gdf2, on=list(range(num_keys)), how=how)
-
-    if len(expect) == 0 and len(got) == 0:
-        expect = expect.reset_index(drop=True)
-        got = got[expect.columns]  # reorder columns
-
-    if how == "right":
-        got = got[expect.columns]  # reorder columns
-
-    assert_join_results_equal(expect, got, how=how)
-
-
-@pytest.mark.parametrize(
-    "str_data_nulls",
-    [
-        ["a", "b", "c"],
-        ["a", "b", "f", "g"],
-        ["f", "g", "h", "i", "j"],
-        ["f", "g", "h"],
-        [None, None, None, None, None],
-        [],
-    ],
-)
-def test_string_join_key_nulls(str_data_nulls):
-    str_data = ["a", "b", "c", "d", "e"]
-    other_data = [1, 2, 3, 4, 5]
-
-    other_data_nulls = [6, 7, 8, 9, 10][: len(str_data_nulls)]
-
-    pdf = pd.DataFrame()
-    gdf = cudf.DataFrame()
-    pdf["key"] = pd.Series(str_data, dtype="str")
-    gdf["key"] = cudf.Series(str_data, dtype="str")
-    pdf["vals"] = other_data
-    gdf["vals"] = other_data
-
-    pdf2 = pd.DataFrame()
-    gdf2 = cudf.DataFrame()
-    pdf2["key"] = pd.Series(str_data_nulls, dtype="str")
-    gdf2["key"] = cudf.Series(str_data_nulls, dtype="str")
-    pdf2["vals"] = pd.Series(other_data_nulls, dtype="int64")
-    gdf2["vals"] = cudf.Series(other_data_nulls, dtype="int64")
-
-    expect = pdf.merge(pdf2, on="key", how="left")
-    got = gdf.merge(gdf2, on="key", how="left")
-    got["vals_y"] = got["vals_y"].fillna(-1)
-
-    if len(expect) == 0 and len(got) == 0:
-        expect = expect.reset_index(drop=True)
-        got = got[expect.columns]
-
-    expect["vals_y"] = expect["vals_y"].fillna(-1).astype("int64")
-
-    assert_join_results_equal(expect, got, how="left")
-
-
-@pytest.mark.parametrize(
-    "str_data", [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]]
-)
-@pytest.mark.parametrize("num_cols", [1, 2, 3])
-@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"])
-def test_string_join_non_key(str_data, num_cols, how):
-    other_data = [1, 2, 3, 4, 5][: len(str_data)]
-
-    pdf = pd.DataFrame()
-    gdf = cudf.DataFrame()
-    for i in range(num_cols):
-        pdf[i] = pd.Series(str_data, dtype="str")
-        gdf[i] = cudf.Series(str_data, dtype="str")
-    pdf["a"] = other_data
-    gdf["a"] = other_data
-
-    pdf2 = pdf.copy()
-    gdf2 = gdf.copy()
-
-    expect = pdf.merge(pdf2, on=["a"], how=how)
-    got = gdf.merge(gdf2, on=["a"], how=how)
-
-    if len(expect) == 0 and len(got) == 0:
-        expect = expect.reset_index(drop=True)
-        got = got[expect.columns]
-
-    if how == "right":
-        got = got[expect.columns]  # reorder columns
-
-    assert_join_results_equal(expect, got, how=how)
-
-
-@pytest.mark.parametrize(
-    "str_data_nulls",
-    [
-        ["a", "b", "c"],
-        ["a", "b", "f", "g"],
-        ["f", "g", "h", "i", "j"],
-        ["f", "g", "h"],
-        [None, None, None, None, None],
-        [],
-    ],
-)
-def test_string_join_non_key_nulls(str_data_nulls):
-    str_data = ["a", "b", "c", "d", "e"]
-    other_data = [1, 2, 3, 4, 5]
-
-    other_data_nulls = [6, 7, 8, 9, 10][: len(str_data_nulls)]
-
-    pdf = pd.DataFrame()
-    gdf = cudf.DataFrame()
-    pdf["vals"] = pd.Series(str_data, dtype="str")
-    gdf["vals"] = cudf.Series(str_data, dtype="str")
-    pdf["key"] = other_data
-    gdf["key"] = other_data
-
-    pdf2 = pd.DataFrame()
-    gdf2 = cudf.DataFrame()
-    pdf2["vals"] = pd.Series(str_data_nulls, dtype="str")
-    gdf2["vals"] = cudf.Series(str_data_nulls, dtype="str")
-    pdf2["key"] = pd.Series(other_data_nulls, dtype="int64")
-    gdf2["key"] = cudf.Series(other_data_nulls, dtype="int64")
-
-    expect = pdf.merge(pdf2, on="key", how="left")
-    got = gdf.merge(gdf2, on="key", how="left")
-
-    if len(expect) == 0 and len(got) == 0:
-        expect = expect.reset_index(drop=True)
-        got = got[expect.columns]
-
-    assert_join_results_equal(expect, got, how="left")
-
-
-def test_string_join_values_nulls():
-    left_dict = [
-        {"b": "MATCH 1", "a": 1.0},
-        {"b": "MATCH 1", "a": 1.0},
-        {"b": "LEFT NO MATCH 1", "a": -1.0},
-        {"b": "MATCH 2", "a": 2.0},
-        {"b": "MATCH 2", "a": 2.0},
-        {"b": "MATCH 1", "a": 1.0},
-        {"b": "MATCH 1", "a": 1.0},
-        {"b": "MATCH 2", "a": 2.0},
-        {"b": "MATCH 2", "a": 2.0},
-        {"b": "LEFT NO MATCH 2", "a": -2.0},
-        {"b": "MATCH 3", "a": 3.0},
-        {"b": "MATCH 3", "a": 3.0},
-    ]
-
-    right_dict = [
-        {"b": "RIGHT NO MATCH 1", "c": -1.0},
-        {"b": "MATCH 3", "c": 3.0},
-        {"b": "MATCH 2", "c": 2.0},
-        {"b": "RIGHT NO MATCH 2", "c": -2.0},
-        {"b": "RIGHT NO MATCH 3", "c": -3.0},
-        {"b": "MATCH 1", "c": 1.0},
-    ]
-
-    left_pdf = pd.DataFrame(left_dict)
-    right_pdf = pd.DataFrame(right_dict)
-
-    left_gdf = cudf.DataFrame.from_pandas(left_pdf)
-    right_gdf = cudf.DataFrame.from_pandas(right_pdf)
-
-    expect = left_pdf.merge(right_pdf, how="left", on="b")
-    got = left_gdf.merge(right_gdf, how="left", on="b")
-
-    expect = expect.sort_values(by=["a", "b", "c"]).reset_index(drop=True)
-    got = got.sort_values(by=["a", "b", "c"]).reset_index(drop=True)
-
-    assert_join_results_equal(expect, got, how="left")
-
-
 @pytest.mark.parametrize(
     "str_data", [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]]
 )

From 6405634a111e1b44600f079191d2eba3c77f6715 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Thu, 24 Jun 2021 11:13:43 -0700
Subject: [PATCH 49/54] pin librdkafka to >=1.6.0,<1.7.0a0 in libcudf_kafka recipe

---
 conda/recipes/libcudf_kafka/meta.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml
index a8ab6811f5a..ee86564bf2b 100644
--- a/conda/recipes/libcudf_kafka/meta.yaml
+++ b/conda/recipes/libcudf_kafka/meta.yaml
@@ -27,7 +27,7 @@ requirements:
     - cmake >=3.20.1
   host:
     - libcudf {{version}} cuda{{ cuda_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
-    - librdkafka >= 1.6.1,<1.7.0a0
+    - librdkafka >=1.6.0,<1.7.0a0
   run:
     - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not
 

From be35c9c125c4c7e3c081dcaaa4a21a4e24f1d52b Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Thu, 24 Jun 2021 15:28:14 -0500
Subject: [PATCH 50/54] Update python/cudf/cudf/_lib/utils.pyx

Co-authored-by: Ram (Ramakrishna Prabhu) <42624703+rgsl888prabhu@users.noreply.github.com>
---
 python/cudf/cudf/_lib/utils.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
index 449d01357b0..e5dfb5a5c35 100644
--- a/python/cudf/cudf/_lib/utils.pyx
+++ b/python/cudf/cudf/_lib/utils.pyx
@@ -140,7 +140,7 @@ cpdef generate_pandas_metadata(Table table, index):
     metadata = pa.pandas_compat.construct_metadata(
         columns_to_convert=[
             col
-            for col in table._data.columns
+            for col in table._columns
         ],
         df=table,
         column_names=col_names,

From ff945bb20168a9e09bb8c19d06f306463b104d27 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Thu, 24 Jun 2021 21:01:19 -0700
Subject: [PATCH 51/54] add confluent-kafka

---
 ci/gpu/build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index 333c241f615..12ee13433cf 100755
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -85,7 +85,7 @@ gpuci_conda_retry install -y \
 # https://docs.rapids.ai/maintainers/depmgmt/
 gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env
 # gpuci_conda_retry install -y "your-pkg=1.0.0"
-gpuci_conda_retry install -y "arrow-cpp=4.0.1" "pyarrow=4.0.1" 'arrow-cpp-proc=*=cuda' "pyorc"
+gpuci_conda_retry install -y "arrow-cpp=4.0.1" "pyarrow=4.0.1" 'arrow-cpp-proc=*=cuda' "pyorc" "python-confluent-kafka>=1.3.0"
 
 
 gpuci_logger "Check compiler versions"

From db56768f19b379ed94cf69a0d1b6ad6758d7734f Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Fri, 25 Jun 2021 12:01:22 -0700
Subject: [PATCH 52/54] remove libcudf CUDA build-string pin from libcudf_kafka recipe

---
 conda/recipes/libcudf_kafka/meta.yaml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml
index ee86564bf2b..6b15890e7c7 100644
--- a/conda/recipes/libcudf_kafka/meta.yaml
+++ b/conda/recipes/libcudf_kafka/meta.yaml
@@ -2,7 +2,6 @@
 
 {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %}
 {% set minor_version =  version.split('.')[0] + '.' + version.split('.')[1] %}
-{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %}
 
 package:
   name: libcudf_kafka
@@ -26,7 +25,7 @@ requirements:
   build:
     - cmake >=3.20.1
   host:
-    - libcudf {{version}} cuda{{ cuda_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
+    - libcudf {{version}}
     - librdkafka >=1.6.0,<1.7.0a0
   run:
     - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not

From 915c5dff78fbe4485829279d2214c14f705bc79a Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Fri, 25 Jun 2021 14:42:04 -0700
Subject: [PATCH 53/54] fix typo in test_list.py import (cudf.tests._utils -> cudf.testing._utils)

---
 python/cudf/cudf/tests/test_list.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index f0fc07ebe7e..a6a9ba97ef5 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -9,7 +9,7 @@
 import cudf
 from cudf import NA
 from cudf._lib.copying import get_element
-from cudf.tests._utils import (
+from cudf.testing._utils import (
     DATETIME_TYPES,
     NUMERIC_TYPES,
     TIMEDELTA_TYPES,

From e9515ba6e53fb91dd3f16f296d40c4342595beab Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Mon, 28 Jun 2021 07:00:38 -0700
Subject: [PATCH 54/54] revert local arrow installs

---
 ci/cpu/build.sh | 3 ---
 ci/gpu/build.sh | 3 +--
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh
index 72de545147a..e11a0488624 100755
--- a/ci/cpu/build.sh
+++ b/ci/cpu/build.sh
@@ -41,9 +41,6 @@ env
 gpuci_logger "Activate conda env"
 . /opt/conda/etc/profile.d/conda.sh
 conda activate rapids
-# gpuci_conda_retry remove -y --force arrow-cpp pyarrow 'arrow-cpp-proc=*=cuda'
-# gpuci_conda_retry install -y "your-pkg=1.0.0"
-conda install -y "arrow-cpp=4.0.1" "pyarrow=4.0.1" 'arrow-cpp-proc=*=cuda'
 
 # Remove rapidsai-nightly channel if we are building main branch
 if [ "$SOURCE_BRANCH" = "main" ]; then
diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index 12ee13433cf..c854e67fbdf 100755
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -83,9 +83,8 @@ gpuci_conda_retry install -y \
                   "ucx-py=0.21.*"
 
 # https://docs.rapids.ai/maintainers/depmgmt/
-gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env
+# gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env
 # gpuci_conda_retry install -y "your-pkg=1.0.0"
-gpuci_conda_retry install -y "arrow-cpp=4.0.1" "pyarrow=4.0.1" 'arrow-cpp-proc=*=cuda' "pyorc" "python-confluent-kafka>=1.3.0"
 
 
 gpuci_logger "Check compiler versions"