rapidsai · rapids-bot · Mar 4, 2024 · Feb 28, 2024 · Feb 28, 2024 · Feb 28, 2024
@@ -910,7 +910,7 @@ def notna(self):
         """
         raise NotImplementedError
 
-    def to_pandas(self, *, nullable: bool = False):
+    def to_pandas(self, *, nullable: bool = False, arrow_type: bool = False):
         """
         Convert to a Pandas Index.
 
@@ -924,6 +924,12 @@ def to_pandas(self, *, nullable: bool = False):
             If ``nullable`` is ``False``, the resulting index will
             either convert null values to ``np.nan`` or ``None``
             depending on the dtype.
+        arrow_type : bool, Default False
+            Return the Index with a ``pandas.ArrowDtype``
+
+        Notes
+        -----
+        nullable and arrow_type cannot both be set to ``True``
 
         Examples
         --------
@@ -937,6 +943,8 @@ def to_pandas(self, *, nullable: bool = False):
         <class 'pandas.core.indexes.base.Index'>
         >>> type(idx)
         <class 'cudf.core.index.Index'>
+        >>> idx.to_pandas(arrow_type=True)
+        Index([-3, 10, 15, 20], dtype='int64[pyarrow]')
         """
         raise NotImplementedError
 

@@ -770,10 +770,16 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]:
         )
 
     def to_pandas(
-        self, *, index: Optional[pd.Index] = None, nullable: bool = False
+        self,
+        *,
+        index: Optional[pd.Index] = None,
+        nullable: bool = False,
+        arrow_type: bool = False,
     ) -> pd.Series:
         if nullable:
             raise NotImplementedError(f"{nullable=} is not implemented.")
+        elif arrow_type:
+            raise NotImplementedError(f"{arrow_type=} is not implemented.")
 
         if self.categories.dtype.kind == "f":
             new_mask = bools_to_mask(self.notnull())

@@ -199,20 +199,31 @@ def to_pandas(
         *,
         index: Optional[pd.Index] = None,
         nullable: bool = False,
+        arrow_type: bool = False,
     ) -> pd.Series:
         """Convert object to pandas type.
 
         The default implementation falls back to PyArrow for the conversion.
         """
         # This default implementation does not handle nulls in any meaningful
         # way
-        if nullable:
+        if arrow_type and nullable:
+            raise ValueError(
+                f"{arrow_type=} and {nullable=} cannot both be set."
+            )
+        elif nullable:
             raise NotImplementedError(f"{nullable=} is not implemented.")
-        pd_series = self.to_arrow().to_pandas()
+        pa_array = self.to_arrow()
+        if arrow_type:
+            return pd.Series(
+                pd.arrays.ArrowExtensionArray(pa_array), index=index
+            )
+        else:
+            pd_series = pa_array.to_pandas()
 
-        if index is not None:
-            pd_series.index = index
-        return pd_series
+            if index is not None:
+                pd_series.index = index
+            return pd_series
 
     @property
     def values_host(self) -> "np.ndarray":

@@ -318,18 +318,27 @@ def to_pandas(
         *,
         index: Optional[pd.Index] = None,
         nullable: bool = False,
+        arrow_type: bool = False,
     ) -> pd.Series:
-        if nullable:
+        if arrow_type and nullable:
+            raise ValueError(
+                f"{arrow_type=} and {nullable=} cannot both be set."
+            )
+        elif nullable:
             raise NotImplementedError(f"{nullable=} is not implemented.")
-        # `copy=True` workaround until following issue is fixed:
-        # https://issues.apache.org/jira/browse/ARROW-9772
-
-        return pd.Series(
-            self.to_arrow(),
-            copy=True,
-            dtype=self.dtype,
-            index=index,
-        )
+        elif arrow_type:
+            return pd.Series(
+                pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index
+            )
+        else:
+            # `copy=True` workaround until following issue is fixed:
+            # https://issues.apache.org/jira/browse/ARROW-9772
+            return pd.Series(
+                self.to_arrow(),
+                copy=True,
+                dtype=self.dtype,
+                index=index,
+            )
 
     @property
     def values(self):
@@ -723,15 +732,25 @@ def to_pandas(
         *,
         index: Optional[pd.Index] = None,
         nullable: bool = False,
+        arrow_type: bool = False,
     ) -> pd.Series:
-        if nullable:
+        if arrow_type and nullable:
+            raise ValueError(
+                f"{arrow_type=} and {nullable=} cannot both be set."
+            )
+        elif nullable:
             raise NotImplementedError(f"{nullable=} is not implemented.")
-        series = self._local_time.to_pandas().dt.tz_localize(
-            self.dtype.tz, ambiguous="NaT", nonexistent="NaT"
-        )
-        if index is not None:
-            series.index = index
-        return series
+        elif arrow_type:
+            return pd.Series(
+                pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index
+            )
+        else:
+            series = self._local_time.to_pandas().dt.tz_localize(
+                self.dtype.tz, ambiguous="NaT", nonexistent="NaT"
+            )
+            if index is not None:
+                series.index = index
+            return series
 
     def to_arrow(self):
         return pa.compute.assume_timezone(

@@ -105,15 +105,25 @@ def as_interval_column(self, dtype):
             raise ValueError("dtype must be IntervalDtype")
 
     def to_pandas(
-        self, *, index: Optional[pd.Index] = None, nullable: bool = False
+        self,
+        *,
+        index: Optional[pd.Index] = None,
+        nullable: bool = False,
+        arrow_type: bool = False,
     ) -> pd.Series:
         # Note: This does not handle null values in the interval column.
         # However, this exact sequence (calling __from_arrow__ on the output of
         # self.to_arrow) is currently the best known way to convert interval
         # types into pandas (trying to convert the underlying numerical columns
         # directly is problematic), so we're stuck with this for now.
+        if arrow_type and nullable:
+            raise ValueError(
+                f"{arrow_type=} and {nullable=} cannot both be set."
+            )
         if nullable:
             raise NotImplementedError(f"{nullable=} is not implemented.")
+        elif arrow_type:
+            raise NotImplementedError(f"{nullable=} is not implemented.")
         return pd.Series(
             self.dtype.to_pandas().__from_arrow__(self.to_arrow()), index=index
         )

@@ -690,8 +690,17 @@ def to_pandas(
         *,
         index: Optional[pd.Index] = None,
         nullable: bool = False,
+        arrow_type: bool = False,
     ) -> pd.Series:
-        if nullable and self.dtype in np_dtypes_to_pandas_dtypes:
+        if arrow_type and nullable:
+            raise ValueError(
+                f"{arrow_type=} and {nullable=} cannot both be set."
+            )
+        if arrow_type:
+            return pd.Series(
+                pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index
+            )
+        elif nullable and self.dtype in np_dtypes_to_pandas_dtypes:
             pandas_nullable_dtype = np_dtypes_to_pandas_dtypes[self.dtype]
             arrow_array = self.to_arrow()
             pandas_array = pandas_nullable_dtype.__from_arrow__(arrow_array)

@@ -5791,8 +5791,17 @@ def to_pandas(
         *,
         index: Optional[pd.Index] = None,
         nullable: bool = False,
+        arrow_type: bool = False,
     ) -> pd.Series:
-        if nullable:
+        if arrow_type and nullable:
+            raise ValueError(
+                f"{arrow_type=} and {nullable=} cannot both be set."
+            )
+        if arrow_type:
+            return pd.Series(
+                pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index
+            )
+        elif nullable:
             pandas_array = pd.StringDtype().__from_arrow__(self.to_arrow())
             pd_series = pd.Series(pandas_array, copy=False)
         else:

@@ -58,14 +58,27 @@ def to_arrow(self):
         )
 
     def to_pandas(
-        self, *, index: Optional[pd.Index] = None, nullable: bool = False
+        self,
+        *,
+        index: Optional[pd.Index] = None,
+        nullable: bool = False,
+        arrow_type: bool = False,
     ) -> pd.Series:
         # We cannot go via Arrow's `to_pandas` because of the following issue:
         # https://issues.apache.org/jira/browse/ARROW-12680
-        if nullable:
+        if arrow_type and nullable:
+            raise ValueError(
+                f"{arrow_type=} and {nullable=} cannot both be set."
+            )
+        elif nullable:
             raise NotImplementedError(f"{nullable=} is not implemented.")
-
-        return pd.Series(self.to_arrow().tolist(), dtype="object", index=index)
+        pa_array = self.to_arrow()
+        if arrow_type:
+            return pd.Series(
+                pd.arrays.ArrowExtensionArray(pa_array), index=index
+            )
+        else:
+            return pd.Series(pa_array.tolist(), dtype="object", index=index)
 
     @cached_property
     def memory_usage(self):

@@ -147,20 +147,31 @@ def to_arrow(self) -> pa.Array:
         )
 
     def to_pandas(
-        self, *, index: Optional[pd.Index] = None, nullable: bool = False
+        self,
+        *,
+        index: Optional[pd.Index] = None,
+        nullable: bool = False,
+        arrow_type: bool = False,
     ) -> pd.Series:
         # `copy=True` workaround until following issue is fixed:
         # https://issues.apache.org/jira/browse/ARROW-9772
-
-        if nullable:
+        if arrow_type and nullable:
+            raise ValueError(
+                f"{arrow_type=} and {nullable=} cannot both be set."
+            )
+        elif nullable:
             raise NotImplementedError(f"{nullable=} is not implemented.")
-
-        return pd.Series(
-            self.to_arrow(),
-            copy=True,
-            dtype=self.dtype,
-            index=index,
-        )
+        elif arrow_type:
+            return pd.Series(
+                pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index
+            )
+        else:
+            return pd.Series(
+                self.to_arrow(),
+                copy=True,
+                dtype=self.dtype,
+                index=index,
+            )
 
     def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
         reflect, op = self._check_reflected_op(op)

@@ -5203,7 +5203,9 @@ def describe(
             return res
 
     @_cudf_nvtx_annotate
-    def to_pandas(self, *, nullable: bool = False) -> pd.DataFrame:
+    def to_pandas(
+        self, *, nullable: bool = False, arrow_type: bool = False
+    ) -> pd.DataFrame:
         """
         Convert to a Pandas DataFrame.
 
@@ -5218,11 +5220,17 @@ def to_pandas(self, *, nullable: bool = False) -> pd.DataFrame:
             If ``nullable`` is ``False``,
             the resulting columns will either convert null
             values to ``np.nan`` or ``None`` depending on the dtype.
+        arrow_type : bool, Default False
+            Return the Index with a ``pandas.ArrowDtype``
 
         Returns
         -------
         out : Pandas DataFrame
 
+        Notes
+        -----
+        nullable and arrow_type cannot both be set to ``True``
+
         Examples
         --------
         >>> import cudf
@@ -5236,8 +5244,7 @@ def to_pandas(self, *, nullable: bool = False) -> pd.DataFrame:
         >>> type(pdf)
         <class 'pandas.core.frame.DataFrame'>
 
-        ``nullable`` parameter can be used to control
-        whether dtype can be Pandas Nullable or not:
+        ``nullable=True`` converts the result to pandas nullable types:
 
         >>> df = cudf.DataFrame({'a': [0, None, 2], 'b': [True, False, None]})
         >>> df
@@ -5265,13 +5272,20 @@ def to_pandas(self, *, nullable: bool = False) -> pd.DataFrame:
         a    float64
         b     object
         dtype: object
+
+        ``arrow_type=True`` converts the result to ``pandas.ArrowDtype``:
+
+        >>> df.to_pandas(arrow_type=True).dtypes
+        a    int64[pyarrow]
+        b     bool[pyarrow]
+        dtype: object
         """
         out_data = {}
         out_index = self.index.to_pandas()
 
         for i, col_key in enumerate(self._data):
             out_data[i] = self._data[col_key].to_pandas(
-                index=out_index, nullable=nullable
+                index=out_index, nullable=nullable, arrow_type=arrow_type
             )
 
         out_df = pd.DataFrame(out_data, index=out_index)