Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow to_pandas to return pandas.ArrowDtype #15182

Merged
merged 7 commits into from
Mar 4, 2024
10 changes: 9 additions & 1 deletion python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -910,7 +910,7 @@ def notna(self):
"""
raise NotImplementedError

def to_pandas(self, *, nullable: bool = False):
def to_pandas(self, *, nullable: bool = False, arrow_type: bool = False):
"""
Convert to a Pandas Index.

Expand All @@ -924,6 +924,12 @@ def to_pandas(self, *, nullable: bool = False):
If ``nullable`` is ``False``, the resulting index will
either convert null values to ``np.nan`` or ``None``
depending on the dtype.
arrow_type : bool, Default False
Return the Index with a ``pandas.ArrowDtype``

Notes
-----
``nullable`` and ``arrow_type`` cannot both be set to ``True``.

Examples
--------
Expand All @@ -937,6 +943,8 @@ def to_pandas(self, *, nullable: bool = False):
<class 'pandas.core.indexes.base.Index'>
>>> type(idx)
<class 'cudf.core.index.Index'>
>>> idx.to_pandas(arrow_type=True)
Index([-3, 10, 15, 20], dtype='int64[pyarrow]')
"""
raise NotImplementedError

Expand Down
8 changes: 7 additions & 1 deletion python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -770,10 +770,16 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]:
)

def to_pandas(
self, *, index: Optional[pd.Index] = None, nullable: bool = False
self,
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
arrow_type: bool = False,
) -> pd.Series:
if nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")
elif arrow_type:
raise NotImplementedError(f"{arrow_type=} is not implemented.")

if self.categories.dtype.kind == "f":
new_mask = bools_to_mask(self.notnull())
Expand Down
21 changes: 16 additions & 5 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,20 +199,31 @@ def to_pandas(
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
arrow_type: bool = False,
) -> pd.Series:
"""Convert object to pandas type.

The default implementation falls back to PyArrow for the conversion.
"""
# This default implementation does not handle nulls in any meaningful
# way
if nullable:
if arrow_type and nullable:
raise ValueError(
f"{arrow_type=} and {nullable=} cannot both be set."
)
elif nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")
pd_series = self.to_arrow().to_pandas()
pa_array = self.to_arrow()
if arrow_type:
return pd.Series(
pd.arrays.ArrowExtensionArray(pa_array), index=index
)
else:
pd_series = pa_array.to_pandas()

if index is not None:
pd_series.index = index
return pd_series
if index is not None:
pd_series.index = index
return pd_series

@property
def values_host(self) -> "np.ndarray":
Expand Down
53 changes: 36 additions & 17 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,18 +318,27 @@ def to_pandas(
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
arrow_type: bool = False,
) -> pd.Series:
if nullable:
if arrow_type and nullable:
raise ValueError(
f"{arrow_type=} and {nullable=} cannot both be set."
)
elif nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")
# `copy=True` workaround until following issue is fixed:
# https://issues.apache.org/jira/browse/ARROW-9772

return pd.Series(
self.to_arrow(),
copy=True,
dtype=self.dtype,
index=index,
)
elif arrow_type:
return pd.Series(
pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index
)
else:
# `copy=True` workaround until following issue is fixed:
# https://issues.apache.org/jira/browse/ARROW-9772
return pd.Series(
self.to_arrow(),
copy=True,
dtype=self.dtype,
index=index,
)

@property
def values(self):
Expand Down Expand Up @@ -723,15 +732,25 @@ def to_pandas(
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
arrow_type: bool = False,
) -> pd.Series:
if nullable:
if arrow_type and nullable:
raise ValueError(
f"{arrow_type=} and {nullable=} cannot both be set."
)
elif nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")
series = self._local_time.to_pandas().dt.tz_localize(
self.dtype.tz, ambiguous="NaT", nonexistent="NaT"
)
if index is not None:
series.index = index
return series
elif arrow_type:
return pd.Series(
pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index
)
else:
series = self._local_time.to_pandas().dt.tz_localize(
self.dtype.tz, ambiguous="NaT", nonexistent="NaT"
)
if index is not None:
series.index = index
return series

def to_arrow(self):
return pa.compute.assume_timezone(
Expand Down
12 changes: 11 additions & 1 deletion python/cudf/cudf/core/column/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,15 +105,25 @@ def as_interval_column(self, dtype):
raise ValueError("dtype must be IntervalDtype")

def to_pandas(
self, *, index: Optional[pd.Index] = None, nullable: bool = False
self,
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
arrow_type: bool = False,
) -> pd.Series:
# Note: This does not handle null values in the interval column.
# However, this exact sequence (calling __from_arrow__ on the output of
# self.to_arrow) is currently the best known way to convert interval
# types into pandas (trying to convert the underlying numerical columns
# directly is problematic), so we're stuck with this for now.
if arrow_type and nullable:
raise ValueError(
f"{arrow_type=} and {nullable=} cannot both be set."
)
if nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")
elif arrow_type:
raise NotImplementedError(f"{arrow_type=} is not implemented.")
return pd.Series(
self.dtype.to_pandas().__from_arrow__(self.to_arrow()), index=index
)
Expand Down
11 changes: 10 additions & 1 deletion python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -690,8 +690,17 @@ def to_pandas(
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
arrow_type: bool = False,
) -> pd.Series:
if nullable and self.dtype in np_dtypes_to_pandas_dtypes:
if arrow_type and nullable:
raise ValueError(
f"{arrow_type=} and {nullable=} cannot both be set."
)
if arrow_type:
return pd.Series(
pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index
)
elif nullable and self.dtype in np_dtypes_to_pandas_dtypes:
pandas_nullable_dtype = np_dtypes_to_pandas_dtypes[self.dtype]
arrow_array = self.to_arrow()
pandas_array = pandas_nullable_dtype.__from_arrow__(arrow_array)
Expand Down
11 changes: 10 additions & 1 deletion python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -5791,8 +5791,17 @@ def to_pandas(
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
arrow_type: bool = False,
) -> pd.Series:
if nullable:
if arrow_type and nullable:
raise ValueError(
f"{arrow_type=} and {nullable=} cannot both be set."
)
if arrow_type:
return pd.Series(
pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index
)
elif nullable:
pandas_array = pd.StringDtype().__from_arrow__(self.to_arrow())
pd_series = pd.Series(pandas_array, copy=False)
else:
Expand Down
21 changes: 17 additions & 4 deletions python/cudf/cudf/core/column/struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,14 +58,27 @@ def to_arrow(self):
)

def to_pandas(
self, *, index: Optional[pd.Index] = None, nullable: bool = False
self,
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
arrow_type: bool = False,
) -> pd.Series:
# We cannot go via Arrow's `to_pandas` because of the following issue:
# https://issues.apache.org/jira/browse/ARROW-12680
if nullable:
if arrow_type and nullable:
raise ValueError(
f"{arrow_type=} and {nullable=} cannot both be set."
)
elif nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")

return pd.Series(self.to_arrow().tolist(), dtype="object", index=index)
pa_array = self.to_arrow()
if arrow_type:
return pd.Series(
pd.arrays.ArrowExtensionArray(pa_array), index=index
)
else:
return pd.Series(pa_array.tolist(), dtype="object", index=index)

@cached_property
def memory_usage(self):
Expand Down
31 changes: 21 additions & 10 deletions python/cudf/cudf/core/column/timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,20 +147,31 @@ def to_arrow(self) -> pa.Array:
)

def to_pandas(
self, *, index: Optional[pd.Index] = None, nullable: bool = False
self,
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
arrow_type: bool = False,
) -> pd.Series:
# `copy=True` workaround until following issue is fixed:
# https://issues.apache.org/jira/browse/ARROW-9772

if nullable:
if arrow_type and nullable:
raise ValueError(
f"{arrow_type=} and {nullable=} cannot both be set."
)
elif nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")

return pd.Series(
self.to_arrow(),
copy=True,
dtype=self.dtype,
index=index,
)
elif arrow_type:
return pd.Series(
pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index
)
else:
return pd.Series(
self.to_arrow(),
copy=True,
dtype=self.dtype,
index=index,
)

def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
reflect, op = self._check_reflected_op(op)
Expand Down
22 changes: 18 additions & 4 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5203,7 +5203,9 @@ def describe(
return res

@_cudf_nvtx_annotate
def to_pandas(self, *, nullable: bool = False) -> pd.DataFrame:
def to_pandas(
self, *, nullable: bool = False, arrow_type: bool = False
) -> pd.DataFrame:
"""
Convert to a Pandas DataFrame.

Expand All @@ -5218,11 +5220,17 @@ def to_pandas(self, *, nullable: bool = False) -> pd.DataFrame:
If ``nullable`` is ``False``,
the resulting columns will either convert null
values to ``np.nan`` or ``None`` depending on the dtype.
arrow_type : bool, Default False
Return the DataFrame columns with a ``pandas.ArrowDtype``

Returns
-------
out : Pandas DataFrame

Notes
-----
``nullable`` and ``arrow_type`` cannot both be set to ``True``.

Examples
--------
>>> import cudf
Expand All @@ -5236,8 +5244,7 @@ def to_pandas(self, *, nullable: bool = False) -> pd.DataFrame:
>>> type(pdf)
<class 'pandas.core.frame.DataFrame'>

``nullable`` parameter can be used to control
whether dtype can be Pandas Nullable or not:
``nullable=True`` converts the result to pandas nullable types:

>>> df = cudf.DataFrame({'a': [0, None, 2], 'b': [True, False, None]})
>>> df
Expand Down Expand Up @@ -5265,13 +5272,20 @@ def to_pandas(self, *, nullable: bool = False) -> pd.DataFrame:
a float64
b object
dtype: object

``arrow_type=True`` converts the result to ``pandas.ArrowDtype``:

>>> df.to_pandas(arrow_type=True).dtypes
a int64[pyarrow]
b bool[pyarrow]
dtype: object
"""
out_data = {}
out_index = self.index.to_pandas()

for i, col_key in enumerate(self._data):
out_data[i] = self._data[col_key].to_pandas(
index=out_index, nullable=nullable
index=out_index, nullable=nullable, arrow_type=arrow_type
)

out_df = pd.DataFrame(out_data, index=out_index)
Expand Down