Skip to content

Commit

Permalink
Preserve column type and class information in more DataFrame operations (#15949)

Browse files Browse the repository at this point in the history

Narrowing down to a pattern of using `ColumnAccessor._from_columns_like_self` to preserve the column information and then calling `Frame._from_data_like_self` to preserve the `.index`/`.name` information.

This is specifically for operations that operate column-wise, where the result should be the same shape as the input.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: #15949
  • Loading branch information
mroeschke committed Jun 10, 2024
1 parent e3ba131 commit f9b0fc3
Show file tree
Hide file tree
Showing 4 changed files with 83 additions and 104 deletions.
3 changes: 2 additions & 1 deletion python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2688,6 +2688,7 @@ def _set_columns_like(self, other: ColumnAccessor) -> None:
self._data = ColumnAccessor(
data=dict(zip(other.names, self._data.columns)),
multiindex=other.multiindex,
rangeindex=other.rangeindex,
level_names=other.level_names,
label_dtype=other.label_dtype,
verify=False,
Expand Down Expand Up @@ -7534,7 +7535,7 @@ def _sample_axis_1(
def _from_columns_like_self(
self,
columns: List[ColumnBase],
column_names: abc.Iterable[str],
column_names: Optional[abc.Iterable[str]] = None,
index_names: Optional[List[str]] = None,
*,
override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None,
Expand Down
131 changes: 62 additions & 69 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,6 @@
from cudf.api.extensions import no_default
from cudf.api.types import (
_is_non_decimal_numeric_dtype,
is_bool_dtype,
is_decimal_dtype,
is_dict_like,
is_list_like,
is_scalar,
Expand Down Expand Up @@ -372,7 +370,6 @@ def _mimic_inplace(
self._index = result.index
return super()._mimic_inplace(result, inplace)

# Scans
@_cudf_nvtx_annotate
def _scan(self, op, axis=None, skipna=True):
"""
Expand Down Expand Up @@ -417,8 +414,8 @@ def _scan(self, op, axis=None, skipna=True):
cast_to_int = op in ("cumsum", "cumprod")
skipna = True if skipna is None else skipna

results = {}
for name, col in self._data.items():
results = []
for col in self._columns:
if skipna:
result_col = col.nans_to_nulls()
else:
Expand All @@ -429,19 +426,14 @@ def _scan(self, op, axis=None, skipna=True):
else:
result_col = col

if (
cast_to_int
and not is_decimal_dtype(result_col.dtype)
and (
np.issubdtype(result_col.dtype, np.integer)
or np.issubdtype(result_col.dtype, np.bool_)
)
):
if cast_to_int and result_col.dtype.kind in "uib":
# For reductions that accumulate a value (e.g. sum, not max)
# pandas returns an int64 dtype for all int or bool dtypes.
result_col = result_col.astype(np.int64)
results[name] = getattr(result_col, op)()
return self._from_data(results, self.index)
results.append(getattr(result_col, op)())
return self._from_data_like_self(
self._data._from_columns_like_self(results)
)

def _check_data_index_length_match(self) -> None:
# Validate that the number of rows in the data matches the index if the
Expand Down Expand Up @@ -880,7 +872,6 @@ def replace(
FutureWarning,
)
if not (to_replace is None and value is no_default):
copy_data = {}
(
all_na_per_column,
to_replace_per_column,
Expand All @@ -890,10 +881,10 @@ def replace(
value=value,
columns_dtype_map=dict(self._dtypes),
)

copy_data = []
for name, col in self._data.items():
try:
copy_data[name] = col.find_and_replace(
replaced = col.find_and_replace(
to_replace_per_column[name],
replacements_per_column[name],
all_na_per_column[name],
Expand All @@ -906,11 +897,13 @@ def replace(
# that exists in `copy_data`.
# ii. There is an OverflowError while trying to cast
# `to_replace_per_column` to `replacements_per_column`.
copy_data[name] = col.copy(deep=True)
replaced = col.copy(deep=True)
copy_data.append(replaced)
result = self._from_data_like_self(
self._data._from_columns_like_self(copy_data)
)
else:
copy_data = self._data.copy(deep=True)

result = self._from_data(copy_data, self.index)
result = self.copy()

return self._mimic_inplace(result, inplace=inplace)

Expand Down Expand Up @@ -1031,12 +1024,13 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1):
):
lower[0], upper[0] = upper[0], lower[0]

data = {
name: col.clip(lower[i], upper[i])
for i, (name, col) in enumerate(self._data.items())
}
output = self._from_data(data, self.index)
output._copy_type_metadata(self, include_index=False)
data = (
col.clip(low, high)
for col, low, high in zip(self._columns, lower, upper)
)
output = self._from_data_like_self(
self._data._from_columns_like_self(data)
)
return self._mimic_inplace(output, inplace=inplace)

@_cudf_nvtx_annotate
Expand Down Expand Up @@ -1913,7 +1907,7 @@ def nans_to_nulls(self):
2 <NA> <NA>
"""
result = []
for col in self._data.columns:
for col in self._columns:
converted = col.nans_to_nulls()
if converted is col:
converted = converted.copy()
Expand Down Expand Up @@ -2028,8 +2022,8 @@ def interpolate(
)

interpolator = cudf.core.algorithms.get_column_interpolator(method)
columns = {}
for colname, col in data._data.items():
columns = []
for col in data._columns:
if isinstance(col, cudf.core.column.StringColumn):
warnings.warn(
f"{type(self).__name__}.interpolate with object dtype is "
Expand All @@ -2040,9 +2034,12 @@ def interpolate(
col = col.astype("float64").fillna(np.nan)

# Interpolation methods may or may not need the index
columns[colname] = interpolator(col, index=data.index)
columns.append(interpolator(col, index=data.index))

result = self._from_data(columns, index=data.index)
result = self._from_data_like_self(
self._data._from_columns_like_self(columns)
)
result.index = data.index

return (
result
Expand All @@ -2069,8 +2066,8 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
data_columns = (
col.shift(periods, fill_value) for col in self._columns
)
return self.__class__._from_data(
zip(self._column_names, data_columns), self.index
return self._from_data_like_self(
self._data._from_columns_like_self(data_columns)
)

@_cudf_nvtx_annotate
Expand Down Expand Up @@ -3011,8 +3008,6 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self:
self._column_names,
None if has_range_index or not keep_index else self.index.names,
)
result._data.label_dtype = self._data.label_dtype
result._data.rangeindex = self._data.rangeindex

if keep_index and has_range_index:
result.index = self.index[start:stop]
Expand Down Expand Up @@ -3561,11 +3556,6 @@ def sort_values(
),
keep_index=not ignore_index,
)
if (
isinstance(self, cudf.core.dataframe.DataFrame)
and self._data.multiindex
):
out.columns = self._data.to_pandas_index()
return out

def _n_largest_or_smallest(
Expand Down Expand Up @@ -3659,14 +3649,12 @@ def _align_to_index(
result = result.sort_values(sort_col_id)
del result[sort_col_id]

result = self.__class__._from_data(
data=result._data, index=result.index
out = self._from_data(
self._data._from_columns_like_self(result._columns)
)
result._data.multiindex = self._data.multiindex
result._data._level_names = self._data._level_names
result.index.names = self.index.names

return result
out.index = result.index
out.index.names = self.index.names
return out

@_cudf_nvtx_annotate
def _reindex(
Expand Down Expand Up @@ -3898,24 +3886,14 @@ def round(self, decimals=0, how="half_even"):
"decimals must be an integer, a dict-like or a Series"
)

cols = {
name: col.round(decimals[name], how=how)
if (
name in decimals
and _is_non_decimal_numeric_dtype(col.dtype)
and not is_bool_dtype(col.dtype)
)
cols = (
col.round(decimals[name], how=how)
if name in decimals and col.dtype.kind in "fiu"
else col.copy(deep=True)
for name, col in self._data.items()
}

return self.__class__._from_data(
data=cudf.core.column_accessor.ColumnAccessor(
cols,
multiindex=self._data.multiindex,
level_names=self._data.level_names,
),
index=self.index,
)
return self._from_data_like_self(
self._data._from_columns_like_self(cols)
)

def resample(
Expand Down Expand Up @@ -6238,6 +6216,8 @@ def rank(
f"axis={axis} is not yet supported in rank"
)

num_cols = self._num_columns
dropped_cols = False
source = self
if numeric_only:
if isinstance(
Expand All @@ -6255,15 +6235,28 @@ def rank(
source = self._get_columns_by_label(numeric_cols)
if source.empty:
return source.astype("float64")
elif source._num_columns != num_cols:
dropped_cols = True

result_columns = libcudf.sort.rank_columns(
[*source._columns], method_enum, na_option, ascending, pct
)

return self.__class__._from_data(
dict(zip(source._column_names, result_columns)),
index=source.index,
).astype(np.float64)
if dropped_cols:
result = type(source)._from_data(
ColumnAccessor(
dict(zip(source._column_names, result_columns)),
multiindex=self._data.multiindex,
level_names=self._data.level_names,
label_dtype=self._data.label_dtype,
),
)
else:
result = source._from_data_like_self(
self._data._from_columns_like_self(result_columns)
)
result.index = source.index
return result.astype(np.float64)

def convert_dtypes(
self,
Expand Down
41 changes: 8 additions & 33 deletions python/cudf/cudf/core/window/rolling.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION

import itertools

import numba
import pandas as pd
from pandas.api.indexers import BaseIndexer
Expand Down Expand Up @@ -251,27 +249,13 @@ def _apply_agg_column(self, source_column, agg_name):
agg_params=self.agg_params,
)

def _apply_agg_dataframe(self, df, agg_name):
return cudf.DataFrame._from_data(
{
col_name: self._apply_agg_column(col, agg_name)
for col_name, col in df._data.items()
},
index=df.index,
)

def _apply_agg(self, agg_name):
if isinstance(self.obj, cudf.Series):
return cudf.Series._from_data(
{
self.obj.name: self._apply_agg_column(
self.obj._column, agg_name
)
},
index=self.obj.index,
)
else:
return self._apply_agg_dataframe(self.obj, agg_name)
applied = (
self._apply_agg_column(col, agg_name) for col in self.obj._columns
)
return self.obj._from_data_like_self(
self.obj._data._from_columns_like_self(applied)
)

def _reduce(
self,
Expand Down Expand Up @@ -533,18 +517,9 @@ def _window_to_window_sizes(self, window):
)

def _apply_agg(self, agg_name):
index = cudf.MultiIndex.from_frame(
cudf.DataFrame(
{
key: value
for key, value in itertools.chain(
self._group_keys._data.items(),
self.obj.index._data.items(),
)
}
)
index = cudf.MultiIndex._from_data(
{**self._group_keys._data, **self.obj.index._data}
)

result = super()._apply_agg(agg_name)
result.index = index
return result
12 changes: 11 additions & 1 deletion python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -10980,7 +10980,7 @@ def test_squeeze(axis, data):
assert_eq(result, expected)


@pytest.mark.parametrize("column", [range(1), np.array([1], dtype=np.int8)])
@pytest.mark.parametrize("column", [range(1, 2), np.array([1], dtype=np.int8)])
@pytest.mark.parametrize(
"operation",
[
Expand All @@ -10991,6 +10991,16 @@ def test_squeeze(axis, data):
lambda df: abs(df),
lambda df: -df,
lambda df: ~df,
lambda df: df.cumsum(),
lambda df: df.replace(1, 2),
lambda df: df.replace(10, 20),
lambda df: df.clip(0, 10),
lambda df: df.rolling(1).mean(),
lambda df: df.interpolate(),
lambda df: df.shift(),
lambda df: df.sort_values(1),
lambda df: df.round(),
lambda df: df.rank(),
],
)
def test_op_preserves_column_metadata(column, operation):
Expand Down

0 comments on commit f9b0fc3

Please sign in to comment.